[llvm-branch-commits] [llvm] [AMDGPU][gfx1250] Implement SIMemoryLegalizer (PR #154726)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Aug 22 01:13:13 PDT 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/154726
>From 7c8791184ddd16d0e6049a08d5f5a8f7fb5c429f Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Aug 2025 12:14:21 +0200
Subject: [PATCH 1/4] [AMDGPU][gfx1250] Implement SIMemoryLegalizer
Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model.
Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model.
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 73 +-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 5 +
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 22 +
.../CodeGen/AMDGPU/atomics-system-scope.ll | 8 +
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 292 +-
.../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 25 +
.../memory-legalizer-fence-mmra-global.ll | 242 +-
.../CodeGen/AMDGPU/memory-legalizer-fence.ll | 246 +-
.../AMDGPU/memory-legalizer-flat-agent.ll | 2964 ++++++++--------
.../AMDGPU/memory-legalizer-flat-lastuse.ll | 22 +-
.../memory-legalizer-flat-nontemporal.ll | 22 +-
.../memory-legalizer-flat-singlethread.ll | 2220 ++++++------
.../AMDGPU/memory-legalizer-flat-system.ll | 3022 ++++++++---------
.../AMDGPU/memory-legalizer-flat-volatile.ll | 74 +-
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 2189 ++++++------
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 2330 +++++++------
.../AMDGPU/memory-legalizer-global-agent.ll | 2888 ++++++++--------
.../AMDGPU/memory-legalizer-global-lastuse.ll | 22 +-
.../memory-legalizer-global-nontemporal.ll | 22 +-
.../memory-legalizer-global-singlethread.ll | 2220 ++++++------
.../AMDGPU/memory-legalizer-global-system.ll | 2789 ++++++++-------
.../memory-legalizer-global-volatile.ll | 74 +-
.../memory-legalizer-global-wavefront.ll | 2220 ++++++------
.../memory-legalizer-global-workgroup.ll | 2442 +++++++------
.../AMDGPU/memory-legalizer-local-agent.ll | 824 ++---
.../AMDGPU/memory-legalizer-local-system.ll | 824 ++---
.../AMDGPU/memory-legalizer-local-volatile.ll | 21 +-
.../memory-legalizer-local-workgroup.ll | 824 ++---
30 files changed, 14856 insertions(+), 14076 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 2a8385df3f934..e416c91ba52d2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1831,6 +1831,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasScratchBaseForwardingHazard() const {
return GFX1250Insts && getGeneration() == GFX12;
}
+
+ /// \returns true if the subtarget requires a wait for xcnt before atomic
+ /// flat/global stores & rmw.
+ bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f7c7bb509c9ef..bcf8a86effe5a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1051,6 +1051,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return AMDGPU::S_WAIT_DSCNT;
case AMDGPU::S_WAIT_KMCNT_soft:
return AMDGPU::S_WAIT_KMCNT;
+ case AMDGPU::S_WAIT_XCNT_soft:
+ return AMDGPU::S_WAIT_XCNT;
default:
return Opcode;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 53f554eccb1fb..5d1ea29ae6c0d 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -587,7 +587,11 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
public:
- SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+ // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
+ // the behavior is the same if assuming GFX12.0 in CU mode.
+ assert(ST.hasGFX1250Insts() ? ST.isCuModeEnabled() : true);
+ }
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2340,12 +2344,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
STORECnt |= true;
break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to wait for operations to complete to ensure
- // they are visible to waves in the other CU as the L0 is per CU.
- // Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
- if (!ST.isCuModeEnabled()) {
+ // GFX12.0:
+ // In WGP mode the waves of a work-group can be executing on either CU
+ // of the WGP. Therefore need to wait for operations to complete to
+ // ensure they are visible to waves in the other CU as the L0 is per CU.
+ // Otherwise in CU mode and all waves of a work-group are on the same CU
+ // which shares the same L0.
+ //
+ // GFX12.5:
+ // TODO DOCS
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2366,7 +2374,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
@@ -2397,7 +2405,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
//
// This also applies to fences. Fences cannot pair with an instruction
// tracked with bvh/samplecnt as we don't have any atomics that do that.
- if (Order != AtomicOrdering::Acquire) {
+ if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
}
@@ -2449,10 +2457,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore we need to invalidate the L0 which is per CU.
- // Otherwise in CU mode all waves of a work-group are on the same CU, and so
- // the L0 does not need to be invalidated.
+ // GFX12.0:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore we need to invalidate the L0 which is per CU.
+ // Otherwise in CU mode all waves of a work-group are on the same CU, and
+ // so the L0 does not need to be invalidated.
+ //
+ // GFX12.5:
+ // TODO DOCS
if (ST.isCuModeEnabled())
return false;
@@ -2497,7 +2509,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- // global_wb is only necessary at system scope for gfx120x targets.
+ // global_wb is only necessary at system scope for GFX12.0;
+ // it is also necessary at device scope for GFX12.5.
//
// Emitting it for lower scopes is a slow no-op, so we omit it
// for performance.
@@ -2507,6 +2520,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
.addImm(AMDGPU::CPol::SCOPE_SYS);
break;
case SIAtomicScope::AGENT:
+ // TODO DOCS
+ if (ST.hasGFX1250Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+ .addImm(AMDGPU::CPol::SCOPE_DEV);
+ }
+ break;
case SIAtomicScope::WORKGROUP:
// No WB necessary, but we still have to wait.
break;
@@ -2569,17 +2588,31 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
}
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
- MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
- if (!CPol)
- return false;
+ assert(MI.mayStore() && "Not a Store inst");
+ const bool IsRMW = (MI.mayLoad() && MI.mayStore());
+ bool Changed = false;
+
+ // GFX12.5 only: xcnt wait is needed before flat/global atomic stores/RMWs.
+ if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
+ // Remaining fixes do not apply to RMWs
+ if (IsRMW)
+ return Changed;
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol) // Some vmem operations do not have a scope and are not concerned.
+ return Changed;
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
// GFX12.0 only: Extra waits needed before system scope stores.
if (!ST.hasGFX1250Insts()) {
if (!Atomic && Scope == CPol::SCOPE_SYS)
return insertWaitsBeforeSystemScopeStore(MI);
- return false;
+ return Changed;
}
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
@@ -2589,7 +2622,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
(!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
return setScope(MI, CPol::SCOPE_SE);
- return false;
+ return Changed;
}
bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
@@ -2778,6 +2811,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ MachineInstr &RMWMI = *MI;
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
@@ -2812,6 +2846,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Position::AFTER);
}
+ Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index a003a46191a87..8012e9e6bc9bc 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1656,6 +1656,11 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+
+let SubtargetPredicate = HasWaitXcnt in {
+ def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
+}
+
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 481a2540eacb7..e886ea4fc6ac6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2031,6 +2047,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
index 5fc9f4a0f8038..4bb2a13d02cc7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
@@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1344,6 +1348,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1386,6 +1391,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1428,6 +1434,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1470,6 +1477,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 004d3c0c1cf53..9e3348bbfdef6 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -7,6 +7,8 @@
define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -21,6 +23,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -36,6 +40,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i
define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -50,6 +56,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase
define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -65,6 +73,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -81,6 +91,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff
define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -109,6 +121,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -123,6 +137,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -145,6 +161,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -159,6 +177,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -182,6 +202,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset,
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -195,6 +217,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset,
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -215,6 +239,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -228,6 +254,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 offset:42 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -271,6 +299,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB10_5
; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -317,6 +347,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB10_5
; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -371,6 +403,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB11_5
; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -420,6 +454,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB11_5
; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -469,6 +505,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -506,6 +544,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -551,6 +591,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -591,6 +633,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -621,6 +665,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -636,6 +682,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -652,6 +700,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -665,6 +715,8 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -699,6 +751,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB18_5
; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -745,6 +799,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB18_5
; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -799,6 +855,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB19_5
; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -848,6 +906,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB19_5
; GFX1250-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -897,6 +957,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -937,6 +999,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -985,6 +1049,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1028,6 +1094,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1061,6 +1129,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_sub_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1076,6 +1146,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_sub_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1092,6 +1164,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_sub_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1105,6 +1179,8 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1139,6 +1215,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB26_5
; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1185,6 +1263,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB26_5
; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1239,6 +1319,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB27_5
; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1288,6 +1370,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB27_5
; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1337,6 +1421,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1377,6 +1463,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1425,6 +1513,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1468,6 +1558,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1501,6 +1593,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_and_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1516,6 +1610,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1532,6 +1628,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_and_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1545,6 +1643,8 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1579,6 +1679,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB34_5
; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1626,6 +1728,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB34_5
; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1681,6 +1785,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB35_5
; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1731,6 +1837,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB35_5
; GFX1250-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1781,6 +1889,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1822,6 +1932,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1871,6 +1983,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1915,6 +2029,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1949,6 +2065,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_or_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1964,6 +2082,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3
define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1980,6 +2100,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff
define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_or_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1993,6 +2115,8 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2027,6 +2151,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB42_5
; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2074,6 +2200,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB42_5
; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2129,6 +2257,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB43_5
; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2179,6 +2309,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB43_5
; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2229,6 +2361,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2270,6 +2404,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2319,6 +2455,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2363,6 +2501,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2397,6 +2537,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xor_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2412,6 +2554,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2428,6 +2572,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xor_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2441,6 +2587,8 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2475,6 +2623,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB50_5
; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2522,6 +2672,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB50_5
; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2577,6 +2729,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB51_5
; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2627,6 +2781,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB51_5
; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2677,6 +2833,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2718,6 +2876,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2767,6 +2927,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2811,6 +2973,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2873,7 +3037,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-LABEL: flat_max_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -2885,7 +3049,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -2914,20 +3078,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB58_5
; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -2961,20 +3122,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB58_5
; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -3016,20 +3174,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB59_5
; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -3066,20 +3221,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB59_5
; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -3120,9 +3272,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2
@@ -3159,9 +3311,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2
@@ -3206,9 +3358,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2
@@ -3248,9 +3400,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2
@@ -3307,7 +3459,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-LABEL: flat_min_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -3319,7 +3471,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -3348,20 +3500,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB66_5
; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -3395,20 +3544,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB66_5
; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -3450,20 +3596,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB67_5
; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -3500,20 +3643,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB67_5
; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -3554,9 +3694,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2
@@ -3593,9 +3733,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2
@@ -3640,9 +3780,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2
@@ -3682,9 +3822,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2
@@ -3741,7 +3881,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-LABEL: flat_umax_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -3753,7 +3893,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-LABEL: flat_umax_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -3782,20 +3922,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB74_5
; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -3829,20 +3966,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB74_5
; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -3884,20 +4018,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB75_5
; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -3934,20 +4065,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB75_5
; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -3988,9 +4116,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2
@@ -4027,9 +4155,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2
@@ -4074,9 +4202,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2
@@ -4116,9 +4244,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2
@@ -4175,7 +4303,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-LABEL: flat_umin_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -4187,7 +4315,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -4216,20 +4344,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB82_5
; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -4263,20 +4388,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB82_5
; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -4318,20 +4440,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB83_5
; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -4368,20 +4487,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB83_5
; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -4422,9 +4538,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2
@@ -4461,9 +4577,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2
@@ -4508,9 +4624,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2
@@ -4550,9 +4666,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 0cb2b0b7df3d2..c68871e7c84fa 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1473,6 +1473,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1513,6 +1514,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1557,6 +1561,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1597,6 +1602,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1673,6 +1681,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -1765,6 +1774,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1809,6 +1821,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1849,6 +1862,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1893,6 +1909,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1969,6 +1986,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2063,6 +2081,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2136,6 +2157,7 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2275,6 +2297,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2307,6 +2330,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2339,6 +2363,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 97d52d5f1f26d..209775314a505 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -80,9 +80,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_acquire_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_acquire_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -151,9 +153,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -227,9 +231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -303,9 +309,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -377,9 +385,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_acquire_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -448,9 +458,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -524,9 +536,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -600,9 +614,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -785,13 +801,12 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -891,14 +906,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -998,14 +1012,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1188,13 +1201,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1294,14 +1306,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1401,14 +1412,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1597,14 +1607,12 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1710,15 +1718,13 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1824,15 +1830,13 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2021,14 +2025,12 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2134,15 +2136,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2248,15 +2248,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index b3f6533d43887..07db15ee8e60e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1064,10 +1064,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_acquire_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_acquire_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@@ -1144,10 +1145,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@@ -1229,10 +1231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@@ -1314,10 +1317,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@@ -1389,9 +1393,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_acquire_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@@ -1460,9 +1466,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@@ -1536,9 +1544,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@@ -1612,9 +1622,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
@@ -1797,13 +1809,12 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") release
ret void
@@ -1903,14 +1914,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
ret void
@@ -2010,14 +2020,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
ret void
@@ -2200,13 +2209,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release
ret void
@@ -2306,14 +2314,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
ret void
@@ -2413,14 +2420,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
ret void
@@ -2609,14 +2615,12 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence release
ret void
@@ -2722,15 +2726,13 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence acq_rel
ret void
@@ -2836,15 +2838,13 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence seq_cst
ret void
@@ -3033,14 +3033,12 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") release
ret void
@@ -3146,15 +3144,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
ret void
@@ -3260,15 +3256,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 36adbc0011118..fe7fd8522bd6a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -825,23 +825,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4
@@ -993,15 +989,16 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4
@@ -1152,15 +1149,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4
@@ -1335,19 +1333,19 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") release, align 4
@@ -1522,19 +1520,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4
@@ -1685,15 +1683,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic
@@ -1875,17 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2060,19 +2060,19 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release
@@ -2278,21 +2278,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -2498,21 +2498,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -2722,18 +2722,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2972,24 +2973,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -3228,24 +3227,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -3485,19 +3482,20 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3768,21 +3766,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4046,23 +4045,23 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4357,25 +4356,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4670,25 +4669,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4959,21 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5244,21 +5244,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5553,25 +5554,25 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5866,25 +5867,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6179,25 +6180,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6492,25 +6493,25 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6805,25 +6806,25 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7118,25 +7119,25 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7431,25 +7432,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7744,25 +7745,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8046,21 +8047,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8361,22 +8363,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8686,25 +8689,25 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9033,28 +9036,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9383,28 +9384,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9709,24 +9708,23 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10027,22 +10025,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10371,28 +10370,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10721,28 +10718,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11071,28 +11066,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11421,28 +11414,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11767,26 +11758,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12115,28 +12106,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12465,28 +12454,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12815,28 +12802,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13679,24 +13664,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4
@@ -13848,15 +13829,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4
@@ -14007,15 +13989,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4
@@ -14190,19 +14173,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4
@@ -14377,19 +14360,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4
@@ -14540,15 +14523,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic
@@ -14724,19 +14708,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT: s_endpgm
-;
-; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -14911,19 +14896,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release
@@ -15125,21 +15110,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15341,21 +15326,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15575,19 +15560,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -15836,25 +15822,23 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -16103,25 +16087,23 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -16361,19 +16343,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16640,21 +16623,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16918,23 +16902,23 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17225,25 +17209,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17534,25 +17518,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17819,21 +17803,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18100,21 +18085,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18405,25 +18391,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18714,25 +18700,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19023,25 +19009,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19332,25 +19318,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19641,25 +19627,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19950,25 +19936,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20259,25 +20245,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20568,25 +20554,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20870,21 +20856,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21195,23 +21182,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21521,25 +21509,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21878,29 +21866,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22239,29 +22225,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22576,25 +22560,24 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22905,23 +22888,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23260,29 +23244,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23621,29 +23603,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23982,29 +23962,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24343,29 +24321,27 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24700,27 +24676,27 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25059,29 +25035,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25420,29 +25394,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25781,29 +25753,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index 8d98f532908fe..5ce7db881691c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -108,18 +108,16 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_last_use_and_volatile_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_last_use_and_volatile_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index af48eaf8fcda6..2594cd43b8c11 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -1346,18 +1346,16 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_nontemporal_volatile_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_nontemporal_volatile_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 871c941dd6dca..8a75db2c36dc7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -929,15 +929,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4
@@ -1088,15 +1089,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4
@@ -1247,15 +1249,16 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4
@@ -1406,15 +1409,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4
@@ -1565,15 +1569,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic
@@ -1724,15 +1729,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -1883,15 +1889,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release
@@ -2042,15 +2049,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2201,15 +2209,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -2404,17 +2413,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -2610,17 +2620,18 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2816,17 +2827,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -3066,19 +3078,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3318,19 +3331,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3570,19 +3584,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3822,19 +3837,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4074,19 +4090,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4326,19 +4343,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4578,19 +4596,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4830,19 +4849,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5082,19 +5102,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5334,19 +5355,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5586,19 +5608,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5838,19 +5861,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6090,19 +6114,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6342,19 +6367,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6594,19 +6620,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6890,21 +6917,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7190,21 +7218,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7490,21 +7519,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7790,21 +7820,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8090,21 +8121,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8390,21 +8422,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8690,21 +8723,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8990,21 +9024,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9290,21 +9325,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9590,21 +9626,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9890,21 +9927,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10190,21 +10228,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10490,21 +10529,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10790,21 +10830,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11090,21 +11131,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12030,15 +12072,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4
@@ -12189,15 +12232,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4
@@ -12348,15 +12392,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4
@@ -12507,15 +12552,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -12666,15 +12712,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12825,15 +12872,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12984,15 +13032,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release
@@ -13143,15 +13192,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13302,15 +13352,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13505,17 +13556,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -13711,17 +13763,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13917,17 +13970,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -14167,19 +14221,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14419,19 +14474,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14671,19 +14727,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14923,19 +14980,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15175,19 +15233,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15427,19 +15486,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15679,19 +15739,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15931,19 +15992,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16183,19 +16245,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16435,19 +16498,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16687,19 +16751,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16939,19 +17004,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17191,19 +17257,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17443,19 +17510,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17695,19 +17763,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17991,21 +18060,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18291,21 +18361,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18591,21 +18662,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18891,21 +18963,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19191,21 +19264,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19491,21 +19565,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19791,21 +19866,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20091,21 +20167,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20391,21 +20468,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20691,21 +20769,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20991,21 +21070,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21291,21 +21371,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21591,21 +21672,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21891,21 +21973,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22191,21 +22274,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 9d70a2437e553..b5ea23d4655b6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -829,23 +829,19 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
@@ -997,15 +993,16 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out unordered, align 4
@@ -1156,15 +1153,16 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out monotonic, align 4
@@ -1343,20 +1341,19 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out release, align 4
@@ -1535,20 +1532,19 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
@@ -1699,15 +1695,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in monotonic
@@ -1891,17 +1888,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -2080,20 +2078,19 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in release
@@ -2305,22 +2302,21 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -2532,22 +2528,21 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -2759,18 +2754,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -3015,25 +3011,22 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -3278,25 +3271,22 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -3536,19 +3526,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3821,21 +3812,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4103,24 +4095,23 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4421,26 +4412,25 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4741,26 +4731,25 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5033,21 +5022,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5320,21 +5310,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5635,26 +5626,25 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5955,26 +5945,25 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6275,26 +6264,25 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6595,26 +6583,25 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6915,26 +6902,25 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7235,26 +7221,25 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7555,26 +7540,25 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7875,26 +7859,25 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8178,21 +8161,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8495,22 +8479,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8824,26 +8809,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9178,29 +9162,26 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9535,29 +9516,26 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9864,24 +9842,23 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10184,22 +10161,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10534,29 +10512,26 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10891,29 +10866,26 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11248,29 +11220,26 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11605,29 +11574,26 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11958,27 +11924,26 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12313,29 +12278,26 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12670,29 +12632,26 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13027,29 +12986,26 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13896,24 +13852,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4
@@ -14065,15 +14017,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4
@@ -14224,15 +14177,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4
@@ -14411,20 +14365,19 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") release, align 4
@@ -14603,20 +14556,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4
@@ -14767,15 +14719,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic
@@ -14953,19 +14906,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT: s_endpgm
-;
-; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -15144,20 +15098,19 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release
@@ -15365,22 +15318,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -15588,22 +15540,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -15825,19 +15776,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -16092,26 +16044,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -16366,26 +16315,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -16625,19 +16571,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16906,21 +16853,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17188,24 +17136,23 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17502,26 +17449,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17818,26 +17764,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18106,21 +18051,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18389,21 +18335,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18700,26 +18647,25 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19016,26 +18962,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19332,26 +19277,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19648,26 +19592,25 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19964,26 +19907,25 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20280,26 +20222,25 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20596,26 +20537,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20912,26 +20852,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21215,21 +21154,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21542,23 +21482,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21872,26 +21813,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22236,30 +22176,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22604,30 +22541,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22944,25 +22878,24 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23275,23 +23208,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23636,30 +23570,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24004,30 +23935,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24372,30 +24300,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24740,30 +24665,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25104,28 +25026,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25470,30 +25391,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25838,30 +25756,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -26206,30 +26121,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 43f015c3a2e0f..e917576bd28bc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -145,18 +145,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_nontemporal_load_0:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_nontemporal_load_0:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4
@@ -430,22 +428,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_nontemporal_load_1:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
-; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_nontemporal_load_1:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1160,16 +1156,18 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_volatile_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index f086542b3d1f8..a4804675fd3cf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -929,15 +929,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4
@@ -1088,15 +1089,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4
@@ -1247,15 +1249,16 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4
@@ -1406,15 +1409,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4
@@ -1565,15 +1569,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic
@@ -1724,15 +1729,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -1883,15 +1889,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release
@@ -2042,15 +2049,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2201,15 +2209,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -2404,17 +2413,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -2610,17 +2620,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2816,17 +2827,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -3066,19 +3078,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3318,19 +3331,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3570,19 +3584,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3822,19 +3837,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4074,19 +4090,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4326,19 +4343,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4578,19 +4596,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4830,19 +4849,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5082,19 +5102,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5334,19 +5355,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5586,19 +5608,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5838,19 +5861,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6090,19 +6114,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6342,19 +6367,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6594,19 +6620,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6890,21 +6917,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7190,21 +7218,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7490,21 +7519,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7790,21 +7820,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8090,21 +8121,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8390,21 +8422,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8690,21 +8723,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8990,21 +9024,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9290,21 +9325,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9590,21 +9626,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9890,21 +9927,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10190,21 +10228,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10490,21 +10529,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10790,21 +10830,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11090,21 +11131,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12030,15 +12072,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4
@@ -12189,15 +12232,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4
@@ -12348,15 +12392,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4
@@ -12507,15 +12552,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -12666,15 +12712,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12825,15 +12872,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12984,15 +13032,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release
@@ -13143,15 +13192,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13302,15 +13352,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13505,17 +13556,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -13711,17 +13763,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13917,17 +13970,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -14167,19 +14221,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14419,19 +14474,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14671,19 +14727,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14923,19 +14980,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15175,19 +15233,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15427,19 +15486,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15679,19 +15739,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15931,19 +15992,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16183,19 +16245,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16435,19 +16498,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16687,19 +16751,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16939,19 +17004,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17191,19 +17257,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17443,19 +17510,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17695,19 +17763,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17991,21 +18060,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18291,21 +18361,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18591,21 +18662,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18891,21 +18963,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19191,21 +19264,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19491,21 +19565,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19791,21 +19866,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20091,21 +20167,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20391,21 +20468,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20691,21 +20769,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20991,21 +21070,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21291,21 +21371,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21591,21 +21672,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21891,21 +21973,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index d8e6ad043e061..01801637ce770 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -811,17 +811,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
@@ -973,15 +974,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
@@ -1132,15 +1134,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
@@ -1308,16 +1311,18 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
@@ -1485,16 +1490,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
@@ -1645,15 +1652,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
@@ -1823,16 +1831,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2000,16 +2009,18 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
@@ -2196,17 +2207,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -2393,17 +2406,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -2610,17 +2625,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2847,18 +2863,20 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -3085,18 +3103,20 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -3336,19 +3356,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3607,20 +3628,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3877,20 +3899,22 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4166,21 +4190,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4456,21 +4482,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4729,20 +4757,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5001,20 +5030,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5290,21 +5320,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5580,21 +5612,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5870,21 +5904,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6160,21 +6196,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6458,21 +6496,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6770,21 +6809,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7087,22 +7127,24 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7419,22 +7461,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7751,22 +7795,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8066,21 +8112,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8378,21 +8425,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8709,22 +8757,24 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9041,22 +9091,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9373,22 +9425,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9705,22 +9759,24 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10035,22 +10091,24 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10367,22 +10425,24 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10699,22 +10759,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11031,22 +11093,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11839,16 +11903,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -12000,15 +12066,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
@@ -12159,15 +12226,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
@@ -12328,15 +12396,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
@@ -12497,15 +12568,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12656,15 +12730,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -12825,15 +12900,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -12994,15 +13071,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
@@ -13173,15 +13253,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13352,15 +13436,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13563,17 +13651,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13789,17 +13878,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -14015,17 +14107,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -14265,19 +14360,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14527,19 +14623,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14789,19 +14887,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15061,19 +15162,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15333,19 +15438,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15595,19 +15704,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15857,19 +15968,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16129,19 +16242,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16401,19 +16518,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16673,19 +16794,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16945,19 +17070,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17217,19 +17346,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17489,19 +17622,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17761,19 +17898,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18033,19 +18174,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18329,21 +18474,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18637,21 +18783,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18947,21 +19094,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19267,21 +19417,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19587,21 +19740,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19897,21 +20053,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20205,21 +20362,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20525,21 +20683,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20845,21 +21006,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21165,21 +21329,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21485,21 +21652,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21803,21 +21973,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22123,21 +22296,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22443,21 +22619,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22763,21 +22942,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 184e15406bfbc..ad163cefe57d4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -829,23 +829,19 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -1004,15 +1000,16 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4
@@ -1170,15 +1167,16 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4
@@ -1361,19 +1359,19 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4
@@ -1556,19 +1554,19 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4
@@ -1724,15 +1722,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic
@@ -1917,17 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -2108,19 +2108,19 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release
@@ -2330,21 +2330,21 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -2554,21 +2554,21 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -2768,18 +2768,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -3009,24 +3010,22 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -3256,24 +3255,22 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -3494,19 +3491,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3756,21 +3754,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4016,23 +4015,23 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4307,25 +4306,25 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4600,25 +4599,25 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4868,21 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5132,21 +5132,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5421,25 +5422,25 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5714,25 +5715,25 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6007,25 +6008,25 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6300,25 +6301,25 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6593,25 +6594,25 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6886,25 +6887,25 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7179,25 +7180,25 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7472,25 +7473,25 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7741,21 +7742,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8024,22 +8026,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8317,25 +8320,25 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8633,28 +8636,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8952,28 +8953,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9246,24 +9245,23 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9532,22 +9530,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9845,28 +9844,26 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10164,28 +10161,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10483,28 +10478,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10802,28 +10795,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11117,26 +11108,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11434,28 +11425,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11753,28 +11742,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12072,28 +12059,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12918,23 +12903,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
@@ -13093,15 +13074,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4
@@ -13259,15 +13241,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4
@@ -13450,19 +13433,19 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4
@@ -13645,19 +13628,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4
@@ -13813,15 +13796,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic
@@ -14006,17 +13990,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -14197,19 +14182,19 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release
@@ -14419,21 +14404,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -14643,21 +14628,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -14857,18 +14842,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -15098,24 +15084,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15345,24 +15329,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15583,19 +15565,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15845,21 +15828,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16105,23 +16089,23 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16396,25 +16380,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16689,25 +16673,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16957,21 +16941,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17221,21 +17206,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17510,25 +17496,25 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17803,25 +17789,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18096,25 +18082,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18389,25 +18375,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18682,25 +18668,25 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18975,25 +18961,25 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19268,25 +19254,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19561,25 +19547,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19830,21 +19816,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20113,22 +20100,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20426,28 +20414,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20745,28 +20731,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21039,24 +21023,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21325,22 +21308,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21638,28 +21622,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21957,28 +21939,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22276,28 +22256,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22595,28 +22573,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22910,26 +22886,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23227,28 +23203,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23546,28 +23520,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23865,28 +23837,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index ed2d62356f8f2..bda702156905a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -87,18 +87,16 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_last_use_and_volatile_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_last_use_and_volatile_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 0ad64f5599fe7..8c1c57a1658ec 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -1111,18 +1111,16 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_nontemporal_volatile_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_nontemporal_volatile_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 6a5a6e01c741b..4f2ea4493560f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -945,15 +945,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4
@@ -1111,15 +1112,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4
@@ -1277,15 +1279,16 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4
@@ -1443,15 +1446,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4
@@ -1607,15 +1611,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic
@@ -1771,15 +1776,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -1935,15 +1941,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release
@@ -2099,15 +2106,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2263,15 +2271,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -2455,17 +2464,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -2650,17 +2660,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2845,17 +2856,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -3076,19 +3088,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3309,19 +3322,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3542,19 +3556,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3775,19 +3790,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4008,19 +4024,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4241,19 +4258,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4474,19 +4492,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4707,19 +4726,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4940,19 +4960,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5173,19 +5194,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5406,19 +5428,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5639,19 +5662,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5872,19 +5896,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6105,19 +6130,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6338,19 +6364,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6601,21 +6628,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6868,21 +6896,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7135,21 +7164,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7402,21 +7432,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7669,21 +7700,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7936,21 +7968,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8203,21 +8236,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8470,21 +8504,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8737,21 +8772,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9004,21 +9040,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9271,21 +9308,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9538,21 +9576,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9805,21 +9844,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10072,21 +10112,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10339,21 +10380,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11294,15 +11336,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4
@@ -11460,15 +11503,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4
@@ -11626,15 +11670,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4
@@ -11792,15 +11837,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -11956,15 +12002,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12120,15 +12167,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12284,15 +12332,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release
@@ -12448,15 +12497,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -12612,15 +12662,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -12804,17 +12855,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12999,17 +13051,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13194,17 +13247,18 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13425,19 +13479,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13658,19 +13713,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13891,19 +13947,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14124,19 +14181,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14357,19 +14415,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14590,19 +14649,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14823,19 +14883,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15056,19 +15117,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15289,19 +15351,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15522,19 +15585,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15755,19 +15819,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15988,19 +16053,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16221,19 +16287,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16454,19 +16521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16687,19 +16755,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16950,21 +17019,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17217,21 +17287,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17484,21 +17555,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17751,21 +17823,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18018,21 +18091,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18285,21 +18359,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18552,21 +18627,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18819,21 +18895,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19086,21 +19163,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19353,21 +19431,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19620,21 +19699,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19887,21 +19967,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20154,21 +20235,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20421,21 +20503,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20688,21 +20771,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 7ddd515830e11..c8a45deccb462 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -833,23 +833,19 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
@@ -1008,15 +1004,16 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out unordered, align 4
@@ -1174,15 +1171,16 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4
@@ -1369,20 +1367,19 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out release, align 4
@@ -1569,20 +1566,19 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
@@ -1738,15 +1734,16 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic
@@ -1933,17 +1930,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -2128,20 +2126,19 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release
@@ -2357,22 +2354,21 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -2588,22 +2584,21 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -2805,18 +2800,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -3052,25 +3048,22 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -3306,25 +3299,22 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -3545,19 +3535,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3809,21 +3800,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4073,24 +4065,23 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4371,26 +4362,25 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4671,26 +4661,25 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4942,21 +4931,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5208,21 +5198,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5503,26 +5494,25 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5803,26 +5793,25 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6103,26 +6092,25 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6403,26 +6391,25 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6673,21 +6660,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6958,22 +6946,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7277,29 +7266,26 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7603,29 +7589,26 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7900,24 +7883,23 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8188,22 +8170,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8507,29 +8490,26 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8833,29 +8813,26 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9159,29 +9136,26 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9485,29 +9459,26 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9807,27 +9778,26 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10131,29 +10101,26 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10457,29 +10424,26 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10783,29 +10747,26 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11634,23 +11595,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4
@@ -11809,15 +11766,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4
@@ -11975,15 +11933,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4
@@ -12170,20 +12129,19 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4
@@ -12370,20 +12328,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4
@@ -12539,15 +12496,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic
@@ -12734,17 +12692,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -12929,20 +12888,19 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release
@@ -13158,22 +13116,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -13389,22 +13346,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -13606,18 +13562,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -13853,25 +13810,22 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -14107,25 +14061,22 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -14346,19 +14297,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14610,21 +14562,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14874,24 +14827,23 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15172,26 +15124,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15472,26 +15423,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15743,21 +15693,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16009,21 +15960,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16304,26 +16256,25 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16604,26 +16555,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16904,26 +16854,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17204,26 +17153,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17504,26 +17452,25 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17804,26 +17751,25 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18104,26 +18050,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18404,26 +18349,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18674,21 +18618,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18959,22 +18904,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19256,26 +19202,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19579,29 +19524,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19905,29 +19847,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20202,24 +20141,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20490,22 +20428,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20809,29 +20748,26 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21135,29 +21071,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21461,29 +21394,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21787,29 +21717,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22109,27 +22036,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22433,29 +22359,26 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22759,29 +22682,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23085,29 +23005,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 0d18963cbfb68..52b30c366e617 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -148,18 +148,16 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_volatile_load_0:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_volatile_load_0:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4
@@ -360,22 +358,20 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_volatile_load_1:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
-; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_volatile_load_1:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1040,16 +1036,18 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_volatile_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_volatile_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 1aa8305b1a837..f66e6d00e6eab 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -945,15 +945,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4
@@ -1111,15 +1112,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4
@@ -1277,15 +1279,16 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4
@@ -1443,15 +1446,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4
@@ -1607,15 +1611,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic
@@ -1771,15 +1776,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -1935,15 +1941,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release
@@ -2099,15 +2106,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2263,15 +2271,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -2455,17 +2464,18 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -2650,17 +2660,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2845,17 +2856,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -3076,19 +3088,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3309,19 +3322,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3542,19 +3556,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3775,19 +3790,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4008,19 +4024,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4241,19 +4258,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4474,19 +4492,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4707,19 +4726,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4940,19 +4960,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5173,19 +5194,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5406,19 +5428,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5639,19 +5662,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5872,19 +5896,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6105,19 +6130,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6338,19 +6364,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6601,21 +6628,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6868,21 +6896,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7135,21 +7164,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7402,21 +7432,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7669,21 +7700,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7936,21 +7968,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8203,21 +8236,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8470,21 +8504,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8737,21 +8772,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9004,21 +9040,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9271,21 +9308,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9538,21 +9576,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9805,21 +9844,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10072,21 +10112,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10339,21 +10380,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11294,15 +11336,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4
@@ -11460,15 +11503,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4
@@ -11626,15 +11670,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4
@@ -11792,15 +11837,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -11956,15 +12002,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12120,15 +12167,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12284,15 +12332,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release
@@ -12448,15 +12497,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -12612,15 +12662,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -12804,17 +12855,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12999,17 +13051,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13194,17 +13247,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13425,19 +13479,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13658,19 +13713,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13891,19 +13947,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14124,19 +14181,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14357,19 +14415,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14590,19 +14649,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14823,19 +14883,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15056,19 +15117,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15289,19 +15351,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15522,19 +15585,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15755,19 +15819,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15988,19 +16053,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16221,19 +16287,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16454,19 +16521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16687,19 +16755,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16950,21 +17019,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17217,21 +17287,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17484,21 +17555,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17751,21 +17823,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18018,21 +18091,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18285,21 +18359,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18552,21 +18627,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18819,21 +18895,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19086,21 +19163,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19353,21 +19431,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19620,21 +19699,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19887,21 +19967,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20154,21 +20235,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20421,21 +20503,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20688,21 +20771,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 3eab16e6b9713..bbbf8cf7f5cb1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -799,17 +799,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4
@@ -968,15 +969,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4
@@ -1134,15 +1136,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4
@@ -1318,16 +1321,18 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
@@ -1503,16 +1508,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4
@@ -1668,15 +1675,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic
@@ -1842,15 +1850,17 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2024,16 +2034,18 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release
@@ -2217,16 +2229,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2410,16 +2425,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2608,17 +2626,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2828,18 +2847,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -3049,18 +3070,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3281,19 +3304,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3524,19 +3548,21 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3775,20 +3801,22 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4037,20 +4065,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4299,20 +4330,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4543,19 +4577,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4786,19 +4822,21 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5047,20 +5085,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5309,20 +5350,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5571,20 +5615,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5833,20 +5880,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6095,20 +6145,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6357,20 +6410,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6619,20 +6675,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6881,20 +6940,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7145,21 +7207,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7417,21 +7480,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7702,22 +7766,24 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7995,22 +8061,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8288,22 +8356,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8563,21 +8633,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8835,21 +8906,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9127,22 +9199,24 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9420,22 +9494,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9713,22 +9789,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10006,22 +10084,24 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10297,22 +10377,24 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10590,22 +10672,24 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10883,22 +10967,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11176,22 +11262,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11982,16 +12070,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -12150,15 +12240,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4
@@ -12316,15 +12407,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -12492,15 +12584,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4
@@ -12668,15 +12763,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12832,15 +12930,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -13006,15 +13105,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13180,15 +13281,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release
@@ -13364,15 +13468,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13548,15 +13656,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13745,17 +13857,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13957,17 +14070,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -14169,17 +14285,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -14400,19 +14519,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14643,19 +14763,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14886,19 +15008,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15139,19 +15264,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15392,19 +15521,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15635,19 +15768,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15878,19 +16013,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16131,19 +16268,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16384,19 +16525,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16637,19 +16782,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16890,19 +17039,23 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17143,19 +17296,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17396,19 +17553,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17649,19 +17810,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17902,19 +18067,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18165,21 +18334,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18437,21 +18607,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18714,21 +18885,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18998,21 +19172,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19282,21 +19459,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19556,21 +19736,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19828,21 +20009,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20112,21 +20294,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20396,21 +20581,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20680,21 +20868,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20964,21 +21155,24 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21246,21 +21440,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21530,21 +21727,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21814,21 +22014,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22098,21 +22301,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 102616b9a2065..7428ddc780675 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -756,18 +756,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_load_b32 v1, v0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4
@@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4
@@ -1397,16 +1399,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4
@@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release
@@ -2068,17 +2072,18 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3895,19 +3905,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4932,19 +4945,20 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6955,21 +6975,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8417,21 +8441,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 1356fe4854170..d57736ba0230c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -756,18 +756,19 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_load_b32 v1, v0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4
@@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out release, align 4
@@ -1397,16 +1399,17 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4
@@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release
@@ -2068,17 +2072,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3895,19 +3905,20 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4932,19 +4945,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6955,21 +6975,22 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8417,21 +8441,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 75e28f9008e28..d8ba02adf4b35 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -883,16 +883,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_volatile_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_volatile_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 6aaf9d323b1fd..7220c071bf657 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -756,18 +756,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_load_b32 v1, v0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4
@@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
@@ -1397,16 +1399,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4
@@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release
@@ -2068,17 +2072,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3895,19 +3905,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4932,19 +4945,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6955,21 +6975,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8417,21 +8441,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
>From bfd8fe6e001caeba3d6c7f0ff15fc94b52e04846 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Aug 2025 12:35:47 +0200
Subject: [PATCH 2/4] clang-format
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 5d1ea29ae6c0d..27e2ac5372796 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2374,7 +2374,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
>From 73efbb3b139d2906010d6563fa4f8e30dd6edc9e Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Aug 2025 12:44:08 +0200
Subject: [PATCH 3/4] Drop -CU suffix
---
.../memory-legalizer-fence-mmra-global.ll | 240 +-
.../CodeGen/AMDGPU/memory-legalizer-fence.ll | 240 +-
.../AMDGPU/memory-legalizer-flat-agent.ll | 2930 ++++++++---------
.../AMDGPU/memory-legalizer-flat-lastuse.ll | 20 +-
.../memory-legalizer-flat-nontemporal.ll | 20 +-
.../memory-legalizer-flat-singlethread.ll | 2304 ++++++-------
.../AMDGPU/memory-legalizer-flat-system.ll | 2930 ++++++++---------
.../AMDGPU/memory-legalizer-flat-volatile.ll | 72 +-
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 2272 ++++++-------
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 2512 +++++++-------
.../AMDGPU/memory-legalizer-global-agent.ll | 2868 ++++++++--------
.../AMDGPU/memory-legalizer-global-lastuse.ll | 20 +-
.../memory-legalizer-global-nontemporal.ll | 20 +-
.../memory-legalizer-global-singlethread.ll | 2304 ++++++-------
.../AMDGPU/memory-legalizer-global-system.ll | 2706 +++++++--------
.../memory-legalizer-global-volatile.ll | 72 +-
.../memory-legalizer-global-wavefront.ll | 2304 ++++++-------
.../memory-legalizer-global-workgroup.ll | 2648 +++++++--------
.../AMDGPU/memory-legalizer-local-agent.ll | 854 ++---
.../AMDGPU/memory-legalizer-local-system.ll | 854 ++---
.../AMDGPU/memory-legalizer-local-volatile.ll | 22 +-
.../memory-legalizer-local-workgroup.ll | 854 ++---
22 files changed, 14533 insertions(+), 14533 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 209775314a505..6a76f4307dcad 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -80,11 +80,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_acquire_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -153,11 +153,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -231,11 +231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -309,11 +309,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -385,11 +385,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -458,11 +458,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -536,11 +536,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -614,11 +614,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -801,12 +801,12 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -906,13 +906,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1012,13 +1012,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1201,12 +1201,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1306,13 +1306,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1412,13 +1412,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1607,12 +1607,12 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1718,13 +1718,13 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1830,13 +1830,13 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2025,12 +2025,12 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2136,13 +2136,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2248,13 +2248,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 07db15ee8e60e..736a8b58466dd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1064,11 +1064,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_acquire_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@@ -1145,11 +1145,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@@ -1231,11 +1231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@@ -1317,11 +1317,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@@ -1393,11 +1393,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@@ -1466,11 +1466,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@@ -1544,11 +1544,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@@ -1622,11 +1622,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
@@ -1809,12 +1809,12 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") release
ret void
@@ -1914,13 +1914,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
ret void
@@ -2020,13 +2020,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
ret void
@@ -2209,12 +2209,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release
ret void
@@ -2314,13 +2314,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
ret void
@@ -2420,13 +2420,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
ret void
@@ -2615,12 +2615,12 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence release
ret void
@@ -2726,13 +2726,13 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence acq_rel
ret void
@@ -2838,13 +2838,13 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence seq_cst
ret void
@@ -3033,12 +3033,12 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") release
ret void
@@ -3144,13 +3144,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
ret void
@@ -3256,13 +3256,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index fe7fd8522bd6a..55ec0c2255f9b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -825,19 +825,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4
@@ -989,16 +989,16 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4
@@ -1149,16 +1149,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4
@@ -1333,19 +1333,19 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") release, align 4
@@ -1520,19 +1520,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4
@@ -1683,16 +1683,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic
@@ -1874,18 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2060,19 +2060,19 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release
@@ -2278,21 +2278,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -2498,21 +2498,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -2722,19 +2722,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2973,22 +2973,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -3227,22 +3227,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -3482,20 +3482,20 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3766,22 +3766,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4045,23 +4045,23 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4356,25 +4356,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4669,25 +4669,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4958,22 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5244,22 +5244,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5554,25 +5554,25 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5867,25 +5867,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6180,25 +6180,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6493,25 +6493,25 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6806,25 +6806,25 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7119,25 +7119,25 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7432,25 +7432,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7745,25 +7745,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8047,22 +8047,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8363,23 +8363,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8689,25 +8689,25 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9036,26 +9036,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9384,26 +9384,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9708,23 +9708,23 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10025,23 +10025,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10370,26 +10370,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10718,26 +10718,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11066,26 +11066,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11414,26 +11414,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11758,26 +11758,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12106,26 +12106,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12454,26 +12454,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12802,26 +12802,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13664,20 +13664,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4
@@ -13829,16 +13829,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4
@@ -13989,16 +13989,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4
@@ -14173,19 +14173,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4
@@ -14360,19 +14360,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4
@@ -14523,16 +14523,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic
@@ -14710,18 +14710,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -14896,19 +14896,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release
@@ -15110,21 +15110,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15326,21 +15326,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15560,20 +15560,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -15822,23 +15822,23 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -16087,23 +16087,23 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -16343,20 +16343,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16623,22 +16623,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16902,23 +16902,23 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17209,25 +17209,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17518,25 +17518,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17803,22 +17803,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18085,22 +18085,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18391,25 +18391,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18700,25 +18700,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19009,25 +19009,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19318,25 +19318,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19627,25 +19627,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19936,25 +19936,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20245,25 +20245,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20554,25 +20554,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20856,22 +20856,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21182,24 +21182,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21509,25 +21509,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21866,27 +21866,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22225,27 +22225,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22560,24 +22560,24 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22888,24 +22888,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23244,27 +23244,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23603,27 +23603,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23962,27 +23962,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24321,27 +24321,27 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24676,27 +24676,27 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25035,27 +25035,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25394,27 +25394,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25753,27 +25753,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index 5ce7db881691c..0cb5540d9d121 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -108,16 +108,16 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_last_use_and_volatile_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_last_use_and_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 2594cd43b8c11..a130104b904e2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -1346,16 +1346,16 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_nontemporal_volatile_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_nontemporal_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 8a75db2c36dc7..635895259ee32 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -929,16 +929,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4
@@ -1089,16 +1089,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4
@@ -1249,16 +1249,16 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4
@@ -1409,16 +1409,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4
@@ -1569,16 +1569,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic
@@ -1729,16 +1729,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -1889,16 +1889,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release
@@ -2049,16 +2049,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2209,16 +2209,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -2413,18 +2413,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -2620,18 +2620,18 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2827,18 +2827,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -3078,20 +3078,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3331,20 +3331,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3584,20 +3584,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3837,20 +3837,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4090,20 +4090,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4343,20 +4343,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4596,20 +4596,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4849,20 +4849,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5102,20 +5102,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5355,20 +5355,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5608,20 +5608,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5861,20 +5861,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6114,20 +6114,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6367,20 +6367,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6620,20 +6620,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6917,22 +6917,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7218,22 +7218,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7519,22 +7519,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7820,22 +7820,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8121,22 +8121,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8422,22 +8422,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8723,22 +8723,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9024,22 +9024,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9325,22 +9325,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9626,22 +9626,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9927,22 +9927,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10228,22 +10228,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10529,22 +10529,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10830,22 +10830,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11131,22 +11131,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12072,16 +12072,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4
@@ -12232,16 +12232,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4
@@ -12392,16 +12392,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4
@@ -12552,16 +12552,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -12712,16 +12712,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12872,16 +12872,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -13032,16 +13032,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release
@@ -13192,16 +13192,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13352,16 +13352,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13556,18 +13556,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -13763,18 +13763,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13970,18 +13970,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -14221,20 +14221,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14474,20 +14474,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14727,20 +14727,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14980,20 +14980,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15233,20 +15233,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15486,20 +15486,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15739,20 +15739,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15992,20 +15992,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16245,20 +16245,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16498,20 +16498,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16751,20 +16751,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17004,20 +17004,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17257,20 +17257,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17510,20 +17510,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17763,20 +17763,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18060,22 +18060,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18361,22 +18361,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18662,22 +18662,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18963,22 +18963,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19264,22 +19264,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19565,22 +19565,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19866,22 +19866,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20167,22 +20167,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20468,22 +20468,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20769,22 +20769,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21070,22 +21070,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21371,22 +21371,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21672,22 +21672,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21973,22 +21973,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22274,22 +22274,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index b5ea23d4655b6..e45a8e51c836c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -829,19 +829,19 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
@@ -993,16 +993,16 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out unordered, align 4
@@ -1153,16 +1153,16 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out monotonic, align 4
@@ -1341,19 +1341,19 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out release, align 4
@@ -1532,19 +1532,19 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
@@ -1695,16 +1695,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in monotonic
@@ -1888,18 +1888,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -2078,19 +2078,19 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in release
@@ -2302,21 +2302,21 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -2528,21 +2528,21 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -2754,19 +2754,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -3011,22 +3011,22 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -3271,22 +3271,22 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -3526,20 +3526,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3812,22 +3812,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4095,23 +4095,23 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4412,25 +4412,25 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4731,25 +4731,25 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5022,22 +5022,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5310,22 +5310,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5626,25 +5626,25 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5945,25 +5945,25 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6264,25 +6264,25 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6583,25 +6583,25 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6902,25 +6902,25 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7221,25 +7221,25 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7540,25 +7540,25 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7859,25 +7859,25 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8161,22 +8161,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8479,23 +8479,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8809,25 +8809,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9162,26 +9162,26 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9516,26 +9516,26 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9842,23 +9842,23 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10161,23 +10161,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10512,26 +10512,26 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10866,26 +10866,26 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11220,26 +11220,26 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11574,26 +11574,26 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11924,26 +11924,26 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12278,26 +12278,26 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12632,26 +12632,26 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12986,26 +12986,26 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13852,20 +13852,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4
@@ -14017,16 +14017,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4
@@ -14177,16 +14177,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4
@@ -14365,19 +14365,19 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") release, align 4
@@ -14556,19 +14556,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4
@@ -14719,16 +14719,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic
@@ -14908,18 +14908,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -15098,19 +15098,19 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release
@@ -15318,21 +15318,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -15540,21 +15540,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -15776,20 +15776,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -16044,23 +16044,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -16315,23 +16315,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -16571,20 +16571,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16853,22 +16853,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17136,23 +17136,23 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17449,25 +17449,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17764,25 +17764,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18051,22 +18051,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18335,22 +18335,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18647,25 +18647,25 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18962,25 +18962,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19277,25 +19277,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19592,25 +19592,25 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19907,25 +19907,25 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20222,25 +20222,25 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20537,25 +20537,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20852,25 +20852,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21154,22 +21154,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21482,24 +21482,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21813,25 +21813,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22176,27 +22176,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22541,27 +22541,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22878,24 +22878,24 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23208,24 +23208,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23570,27 +23570,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23935,27 +23935,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24300,27 +24300,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24665,27 +24665,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25026,27 +25026,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25391,27 +25391,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25756,27 +25756,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -26121,27 +26121,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index e917576bd28bc..945eb640dad9f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -145,16 +145,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_nontemporal_load_0:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_nontemporal_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4
@@ -428,20 +428,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_nontemporal_load_1:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff
-; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_nontemporal_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1156,18 +1156,18 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index a4804675fd3cf..041b3f51abc2f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -929,16 +929,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4
@@ -1089,16 +1089,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4
@@ -1249,16 +1249,16 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4
@@ -1409,16 +1409,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4
@@ -1569,16 +1569,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic
@@ -1729,16 +1729,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -1889,16 +1889,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release
@@ -2049,16 +2049,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2209,16 +2209,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -2413,18 +2413,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -2620,18 +2620,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2827,18 +2827,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -3078,20 +3078,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3331,20 +3331,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3584,20 +3584,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3837,20 +3837,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4090,20 +4090,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4343,20 +4343,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4596,20 +4596,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4849,20 +4849,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5102,20 +5102,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5355,20 +5355,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5608,20 +5608,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5861,20 +5861,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6114,20 +6114,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6367,20 +6367,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6620,20 +6620,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6917,22 +6917,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7218,22 +7218,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7519,22 +7519,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7820,22 +7820,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8121,22 +8121,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8422,22 +8422,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8723,22 +8723,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9024,22 +9024,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9325,22 +9325,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9626,22 +9626,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9927,22 +9927,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10228,22 +10228,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10529,22 +10529,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10830,22 +10830,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11131,22 +11131,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12072,16 +12072,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4
@@ -12232,16 +12232,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4
@@ -12392,16 +12392,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4
@@ -12552,16 +12552,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -12712,16 +12712,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12872,16 +12872,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -13032,16 +13032,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release
@@ -13192,16 +13192,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13352,16 +13352,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13556,18 +13556,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -13763,18 +13763,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13970,18 +13970,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -14221,20 +14221,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14474,20 +14474,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14727,20 +14727,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14980,20 +14980,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15233,20 +15233,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15486,20 +15486,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15739,20 +15739,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15992,20 +15992,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16245,20 +16245,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16498,20 +16498,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16751,20 +16751,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17004,20 +17004,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17257,20 +17257,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17510,20 +17510,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17763,20 +17763,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18060,22 +18060,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18361,22 +18361,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18662,22 +18662,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18963,22 +18963,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19264,22 +19264,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19565,22 +19565,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19866,22 +19866,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20167,22 +20167,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20468,22 +20468,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20769,22 +20769,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21070,22 +21070,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21371,22 +21371,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21672,22 +21672,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21973,22 +21973,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 01801637ce770..85ecab8128d2f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -811,18 +811,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3]
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
@@ -974,16 +974,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
@@ -1134,16 +1134,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
@@ -1311,18 +1311,18 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
@@ -1490,18 +1490,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
@@ -1652,16 +1652,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
@@ -1831,17 +1831,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2009,18 +2009,18 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
@@ -2207,19 +2207,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -2406,19 +2406,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -2625,18 +2625,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2863,20 +2863,20 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -3103,20 +3103,20 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -3356,20 +3356,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3628,21 +3628,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3899,22 +3899,22 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4190,23 +4190,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4482,23 +4482,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4757,21 +4757,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5030,21 +5030,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5320,23 +5320,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5612,23 +5612,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5904,23 +5904,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6196,23 +6196,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6496,22 +6496,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6809,22 +6809,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7127,24 +7127,24 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7461,24 +7461,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7795,24 +7795,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8112,22 +8112,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8425,22 +8425,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8757,24 +8757,24 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9091,24 +9091,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9425,24 +9425,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9759,24 +9759,24 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10091,24 +10091,24 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10425,24 +10425,24 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10759,24 +10759,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11093,24 +11093,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11903,18 +11903,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3]
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -12066,16 +12066,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
@@ -12226,16 +12226,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
@@ -12396,18 +12396,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
@@ -12568,18 +12568,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12730,16 +12730,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -12900,17 +12900,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13071,18 +13071,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
@@ -13253,19 +13253,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13436,19 +13436,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13651,18 +13651,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13878,20 +13878,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -14107,20 +14107,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -14360,20 +14360,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14623,21 +14623,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14887,22 +14887,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15162,23 +15162,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15438,23 +15438,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15704,21 +15704,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15968,21 +15968,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16242,23 +16242,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16518,23 +16518,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16794,23 +16794,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17070,23 +17070,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17346,23 +17346,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17622,23 +17622,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17898,23 +17898,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18174,23 +18174,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18474,22 +18474,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18783,22 +18783,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19094,24 +19094,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19417,24 +19417,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19740,24 +19740,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20053,22 +20053,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20362,22 +20362,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20683,24 +20683,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21006,24 +21006,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21329,24 +21329,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21652,24 +21652,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21973,24 +21973,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22296,24 +22296,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22619,24 +22619,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22942,24 +22942,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index ad163cefe57d4..5c2d8eb4f5ec0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -829,19 +829,19 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -1000,16 +1000,16 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4
@@ -1167,16 +1167,16 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4
@@ -1359,19 +1359,19 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4
@@ -1554,19 +1554,19 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4
@@ -1722,16 +1722,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic
@@ -1916,18 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -2108,19 +2108,19 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release
@@ -2330,21 +2330,21 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -2554,21 +2554,21 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -2768,19 +2768,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -3010,22 +3010,22 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -3255,22 +3255,22 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -3491,20 +3491,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3754,22 +3754,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4015,23 +4015,23 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4306,25 +4306,25 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4599,25 +4599,25 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4867,22 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5132,22 +5132,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5422,25 +5422,25 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5715,25 +5715,25 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6008,25 +6008,25 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6301,25 +6301,25 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6594,25 +6594,25 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6887,25 +6887,25 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7180,25 +7180,25 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7473,25 +7473,25 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7742,22 +7742,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8026,23 +8026,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8320,25 +8320,25 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8636,26 +8636,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8953,26 +8953,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9245,23 +9245,23 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9530,23 +9530,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9844,26 +9844,26 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10161,26 +10161,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10478,26 +10478,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10795,26 +10795,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11108,26 +11108,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11425,26 +11425,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11742,26 +11742,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12059,26 +12059,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12903,24 +12903,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
-entry:
- %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
- store i32 %val, ptr addrspace(1) %out
- ret void
+; GFX1250-LABEL: global_agent_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
+ store i32 %val, ptr addrspace(1) %out
+ ret void
}
define amdgpu_kernel void @global_agent_one_as_unordered_store(
@@ -13074,16 +13074,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4
@@ -13241,16 +13241,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4
@@ -13433,19 +13433,19 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4
@@ -13628,19 +13628,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4
@@ -13796,16 +13796,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic
@@ -13990,18 +13990,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -14182,19 +14182,19 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release
@@ -14404,21 +14404,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -14628,21 +14628,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -14842,19 +14842,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -15084,22 +15084,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15329,22 +15329,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15565,20 +15565,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15828,22 +15828,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16089,23 +16089,23 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16380,25 +16380,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16673,25 +16673,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16941,22 +16941,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17206,22 +17206,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17496,25 +17496,25 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17789,25 +17789,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18082,25 +18082,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18375,25 +18375,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18668,25 +18668,25 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18961,25 +18961,25 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19254,25 +19254,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19547,25 +19547,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19816,22 +19816,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20100,23 +20100,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20414,26 +20414,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20731,26 +20731,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21023,23 +21023,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21308,23 +21308,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21622,26 +21622,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21939,26 +21939,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22256,26 +22256,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22573,26 +22573,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22886,26 +22886,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23203,26 +23203,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23520,26 +23520,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23837,26 +23837,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index bda702156905a..ca7802d295e0b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -87,16 +87,16 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_last_use_and_volatile_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_last_use_and_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 8c1c57a1658ec..8429af441c062 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -1111,16 +1111,16 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_nontemporal_volatile_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_nontemporal_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 4f2ea4493560f..e7f7b1d196be7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -945,16 +945,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4
@@ -1112,16 +1112,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4
@@ -1279,16 +1279,16 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4
@@ -1446,16 +1446,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4
@@ -1611,16 +1611,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic
@@ -1776,16 +1776,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -1941,16 +1941,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release
@@ -2106,16 +2106,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2271,16 +2271,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -2464,18 +2464,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -2660,18 +2660,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2856,18 +2856,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -3088,20 +3088,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3322,20 +3322,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3556,20 +3556,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3790,20 +3790,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4024,20 +4024,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4258,20 +4258,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4492,20 +4492,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4726,20 +4726,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4960,20 +4960,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5194,20 +5194,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5428,20 +5428,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5662,20 +5662,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5896,20 +5896,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6130,20 +6130,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6364,20 +6364,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6628,22 +6628,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6896,22 +6896,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7164,22 +7164,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7432,22 +7432,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7700,22 +7700,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7968,22 +7968,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8236,22 +8236,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8504,22 +8504,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8772,22 +8772,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9040,22 +9040,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9308,22 +9308,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9576,22 +9576,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9844,22 +9844,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10112,22 +10112,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10380,22 +10380,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11336,16 +11336,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4
@@ -11503,16 +11503,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4
@@ -11670,16 +11670,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4
@@ -11837,16 +11837,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -12002,16 +12002,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12167,16 +12167,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12332,16 +12332,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release
@@ -12497,16 +12497,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -12662,16 +12662,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -12855,18 +12855,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -13051,18 +13051,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13247,18 +13247,18 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13479,20 +13479,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13713,20 +13713,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13947,20 +13947,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14181,20 +14181,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14415,20 +14415,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14649,20 +14649,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14883,20 +14883,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15117,20 +15117,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15351,20 +15351,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15585,20 +15585,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15819,20 +15819,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16053,20 +16053,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16287,20 +16287,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16521,20 +16521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16755,20 +16755,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17019,22 +17019,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17287,22 +17287,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17555,22 +17555,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17823,22 +17823,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18091,22 +18091,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18359,22 +18359,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18627,22 +18627,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18895,22 +18895,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19163,22 +19163,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19431,22 +19431,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19699,22 +19699,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19967,22 +19967,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20235,22 +20235,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20503,22 +20503,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20771,22 +20771,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index c8a45deccb462..e7880a81800fd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -833,19 +833,19 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
@@ -1004,16 +1004,16 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out unordered, align 4
@@ -1171,16 +1171,16 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4
@@ -1367,19 +1367,19 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out release, align 4
@@ -1566,19 +1566,19 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
@@ -1734,16 +1734,16 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic
@@ -1930,18 +1930,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -2126,19 +2126,19 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release
@@ -2354,21 +2354,21 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -2584,21 +2584,21 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -2800,19 +2800,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -3048,22 +3048,22 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -3299,22 +3299,22 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -3535,20 +3535,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3800,22 +3800,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4065,23 +4065,23 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4362,25 +4362,25 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4661,25 +4661,25 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4931,22 +4931,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5198,22 +5198,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5494,25 +5494,25 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5793,25 +5793,25 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6092,25 +6092,25 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6391,25 +6391,25 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6660,22 +6660,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6946,23 +6946,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7266,26 +7266,26 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7589,26 +7589,26 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7883,23 +7883,23 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8170,23 +8170,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8490,26 +8490,26 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8813,26 +8813,26 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9136,26 +9136,26 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9459,26 +9459,26 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9778,26 +9778,26 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10101,26 +10101,26 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10424,26 +10424,26 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10747,26 +10747,26 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11595,19 +11595,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4
@@ -11766,16 +11766,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4
@@ -11933,16 +11933,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4
@@ -12129,19 +12129,19 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4
@@ -12328,19 +12328,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4
@@ -12496,16 +12496,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic
@@ -12692,18 +12692,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -12888,19 +12888,19 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release
@@ -13116,21 +13116,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -13346,21 +13346,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -13562,19 +13562,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -13810,22 +13810,22 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -14061,22 +14061,22 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -14297,20 +14297,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14562,22 +14562,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14827,23 +14827,23 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15124,25 +15124,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15423,25 +15423,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15693,22 +15693,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15960,22 +15960,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16256,25 +16256,25 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16555,25 +16555,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16854,25 +16854,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17153,25 +17153,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17452,25 +17452,25 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17751,25 +17751,25 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18050,25 +18050,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18349,25 +18349,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18618,22 +18618,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18904,23 +18904,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19202,25 +19202,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19524,26 +19524,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19847,26 +19847,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20141,23 +20141,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20428,23 +20428,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20748,26 +20748,26 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21071,26 +21071,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21394,26 +21394,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21717,26 +21717,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22036,26 +22036,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22359,26 +22359,26 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22682,26 +22682,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23005,26 +23005,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 52b30c366e617..4641faa9737ce 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -148,16 +148,16 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_volatile_load_0:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_volatile_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4
@@ -358,20 +358,20 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_volatile_load_1:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff
-; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_volatile_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1036,18 +1036,18 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_volatile_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index f66e6d00e6eab..09eb062d876f6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -945,16 +945,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4
@@ -1112,16 +1112,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4
@@ -1279,16 +1279,16 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4
@@ -1446,16 +1446,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4
@@ -1611,16 +1611,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic
@@ -1776,16 +1776,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -1941,16 +1941,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release
@@ -2106,16 +2106,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2271,16 +2271,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -2464,18 +2464,18 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -2660,18 +2660,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2856,18 +2856,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -3088,20 +3088,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3322,20 +3322,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3556,20 +3556,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3790,20 +3790,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4024,20 +4024,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4258,20 +4258,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4492,20 +4492,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4726,20 +4726,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4960,20 +4960,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5194,20 +5194,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5428,20 +5428,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5662,20 +5662,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5896,20 +5896,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6130,20 +6130,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6364,20 +6364,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6628,22 +6628,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6896,22 +6896,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7164,22 +7164,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7432,22 +7432,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7700,22 +7700,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7968,22 +7968,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8236,22 +8236,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8504,22 +8504,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8772,22 +8772,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9040,22 +9040,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9308,22 +9308,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9576,22 +9576,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9844,22 +9844,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10112,22 +10112,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10380,22 +10380,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11336,16 +11336,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4
@@ -11503,16 +11503,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4
@@ -11670,16 +11670,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4
@@ -11837,16 +11837,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -12002,16 +12002,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12167,16 +12167,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12332,16 +12332,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release
@@ -12497,16 +12497,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -12662,16 +12662,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -12855,18 +12855,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -13051,18 +13051,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13247,18 +13247,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13479,20 +13479,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13713,20 +13713,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13947,20 +13947,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14181,20 +14181,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14415,20 +14415,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14649,20 +14649,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14883,20 +14883,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15117,20 +15117,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15351,20 +15351,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15585,20 +15585,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15819,20 +15819,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16053,20 +16053,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16287,20 +16287,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16521,20 +16521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16755,20 +16755,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17019,22 +17019,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17287,22 +17287,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17555,22 +17555,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17823,22 +17823,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18091,22 +18091,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18359,22 +18359,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18627,22 +18627,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18895,22 +18895,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19163,22 +19163,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19431,22 +19431,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19699,22 +19699,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19967,22 +19967,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20235,22 +20235,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20503,22 +20503,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20771,22 +20771,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index bbbf8cf7f5cb1..885edec03c2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -799,18 +799,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4
@@ -969,16 +969,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4
@@ -1136,16 +1136,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4
@@ -1321,18 +1321,18 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
@@ -1508,18 +1508,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4
@@ -1675,16 +1675,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic
@@ -1850,17 +1850,17 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2034,18 +2034,18 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release
@@ -2229,19 +2229,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2425,19 +2425,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2626,18 +2626,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2847,20 +2847,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -3070,20 +3070,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3304,20 +3304,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3548,21 +3548,21 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3801,22 +3801,22 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4065,23 +4065,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4330,23 +4330,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4577,21 +4577,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4822,21 +4822,21 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5085,23 +5085,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5350,23 +5350,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5615,23 +5615,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5880,23 +5880,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6145,23 +6145,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6410,23 +6410,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6675,23 +6675,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6940,23 +6940,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7207,22 +7207,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7480,22 +7480,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7766,24 +7766,24 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8061,24 +8061,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8356,24 +8356,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8633,22 +8633,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8906,22 +8906,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9199,24 +9199,24 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9494,24 +9494,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9789,24 +9789,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10084,24 +10084,24 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10377,24 +10377,24 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10672,24 +10672,24 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10967,24 +10967,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11262,24 +11262,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12070,18 +12070,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -12240,16 +12240,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4
@@ -12407,16 +12407,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -12584,18 +12584,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4
@@ -12763,18 +12763,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12930,16 +12930,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -13105,17 +13105,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13281,18 +13281,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release
@@ -13468,19 +13468,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13656,19 +13656,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13857,18 +13857,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -14070,20 +14070,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -14285,20 +14285,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -14519,20 +14519,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14763,21 +14763,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15008,22 +15008,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15264,23 +15264,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15521,23 +15521,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15768,21 +15768,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16013,21 +16013,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16268,23 +16268,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16525,23 +16525,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16782,23 +16782,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17039,23 +17039,23 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17296,23 +17296,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17553,23 +17553,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17810,23 +17810,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18067,23 +18067,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18334,22 +18334,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18607,22 +18607,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18885,24 +18885,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19172,24 +19172,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19459,24 +19459,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19736,22 +19736,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20009,22 +20009,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20294,24 +20294,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20581,24 +20581,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20868,24 +20868,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21155,24 +21155,24 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21440,24 +21440,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21727,24 +21727,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22014,24 +22014,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22301,24 +22301,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 7428ddc780675..986b48b60a443 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -756,19 +756,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_load_b32 v1, v0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4
@@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4
@@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4
@@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release
@@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6025,20 +6025,20 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6975,22 +6975,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index d57736ba0230c..81bbe0a78203e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -756,19 +756,19 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_load_b32 v1, v0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4
@@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out release, align 4
@@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4
@@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release
@@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6025,20 +6025,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6975,22 +6975,22 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index d8ba02adf4b35..980141a87ecf3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -883,17 +883,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_volatile_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 7220c071bf657..6a233a2c9013b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -756,19 +756,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_load_b32 v1, v0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4
@@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
@@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4
@@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release
@@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6025,20 +6025,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6975,22 +6975,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
>From 287f75ea8240a6318ad982cd55e765846a673cef Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 22 Aug 2025 10:12:03 +0200
Subject: [PATCH 4/4] Comments
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 7 ++++---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +-
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 27e2ac5372796..00f0c4b9eb8e2 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -590,7 +590,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
// the behavior is the same if assuming GFX12.0 in CU mode.
- assert(ST.hasGFX1250Insts() ? ST.isCuModeEnabled() : true);
+ assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
}
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
@@ -2592,14 +2592,15 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
bool Changed = false;
- // GFX12.5 only: xcnt wait is needed before flat and global atomics stores/rmw
+ // GFX12.5 only: xcnt wait is needed before flat and global atomics
+ // stores/rmw.
if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
MachineBasicBlock &MBB = *MI.getParent();
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
Changed = true;
}
- // Remaining fixes do not apply to RMWs
+ // Remaining fixes do not apply to RMWs.
if (IsRMW)
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 8012e9e6bc9bc..f00d32c4719e0 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1658,7 +1658,7 @@ let OtherPredicates = [HasImageInsts] in {
let SubtargetPredicate = HasWaitXcnt in {
- def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_XCNT_soft : SOPP_Pseudo<"", (ins s16imm:$simm16), "$simm16">;
}
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
More information about the llvm-branch-commits
mailing list