[llvm] [AMDGPU] Mitigate GFX12 VALU read SGPR hazard (PR #100067)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 11 00:49:01 PDT 2024
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/100067
>From eec828fa67ed46b7f3ee06d5c5df34936462d2ff Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 22 Jul 2024 12:39:38 +0900
Subject: [PATCH 1/2] [AMDGPU] Mitigate GFX12 VALU read SGPR hazard
Any SGPR read by a VALU can potentially obscure SALU writes to the
same register. Insert s_wait_alu instructions to mitigate the
hazard on affected paths.
Compute a global cache of SGPRs with any VALU reads and use this to
avoid inserting mitigation for SGPRs never accessed by VALUs.
To avoid excessive search when compile time is a priority, implement a
secondary mode where all SALU writes are mitigated.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 319 ++++++-
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 4 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +
.../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 24 +-
.../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 24 +-
.../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 2 +
.../AMDGPU/atomic_optimizations_buffer.ll | 68 +-
.../atomic_optimizations_global_pointer.ll | 96 +-
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 57 +-
.../atomic_optimizations_struct_buffer.ll | 57 +-
llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 10 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 125 ++-
.../buffer-fat-pointer-atomicrmw-fmax.ll | 167 +++-
.../buffer-fat-pointer-atomicrmw-fmin.ll | 167 +++-
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 104 ++-
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 176 +++-
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 176 +++-
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 232 +++--
llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 18 +-
llvm/test/CodeGen/AMDGPU/fmaximum.ll | 3 +-
llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fminimum.ll | 3 +-
llvm/test/CodeGen/AMDGPU/fminimum3.ll | 4 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 106 ++-
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 176 +++-
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 176 +++-
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 232 +++--
.../test/CodeGen/AMDGPU/global_atomics_i64.ll | 48 +-
.../hazard-recognizer-src-shared-base.ll | 23 +
.../AMDGPU/indirect-call-known-callees.ll | 15 +-
.../insert_waitcnt_for_precise_memory.ll | 43 +-
.../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 6 +
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 44 +-
...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 10 +-
.../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 19 +-
...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 20 +-
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 20 +-
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 20 +-
...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 16 +-
...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 16 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll | 2 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 2 +
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 2 +
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 46 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 37 +-
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 1 +
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 108 ++-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 113 ++-
.../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 80 +-
.../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 80 +-
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 120 ++-
.../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 3 +-
.../lower-work-group-id-intrinsics-hsa.ll | 5 +-
.../lower-work-group-id-intrinsics-pal.ll | 5 +-
.../materialize-frame-index-sgpr.gfx10.ll | 34 +
.../AMDGPU/materialize-frame-index-sgpr.ll | 17 +-
.../AMDGPU/memory-legalizer-flat-agent.ll | 156 ++++
.../AMDGPU/memory-legalizer-flat-lastuse.ll | 8 +
.../memory-legalizer-flat-nontemporal.ll | 26 +
.../memory-legalizer-flat-singlethread.ll | 156 ++++
.../AMDGPU/memory-legalizer-flat-system.ll | 156 ++++
.../AMDGPU/memory-legalizer-flat-volatile.ll | 26 +
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 154 ++++
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 148 +++
.../AMDGPU/memory-legalizer-global-agent.ll | 150 +++
.../AMDGPU/memory-legalizer-global-lastuse.ll | 8 +
.../memory-legalizer-global-nontemporal.ll | 18 +
.../memory-legalizer-global-singlethread.ll | 152 +++
.../AMDGPU/memory-legalizer-global-system.ll | 142 +++
.../memory-legalizer-global-volatile.ll | 20 +
.../memory-legalizer-global-wavefront.ll | 152 +++
.../memory-legalizer-global-workgroup.ll | 152 +++
.../memory-legalizer-invalid-syncscope.ll | 1 +
.../AMDGPU/memory-legalizer-local-agent.ll | 120 +++
.../memory-legalizer-local-nontemporal.ll | 16 +
.../memory-legalizer-local-singlethread.ll | 120 +++
.../AMDGPU/memory-legalizer-local-system.ll | 120 +++
.../AMDGPU/memory-legalizer-local-volatile.ll | 14 +
.../memory-legalizer-local-wavefront.ll | 120 +++
.../memory-legalizer-local-workgroup.ll | 120 +++
.../memory-legalizer-private-lastuse.ll | 6 +
.../memory-legalizer-private-nontemporal.ll | 18 +
.../memory-legalizer-private-volatile.ll | 16 +
.../AMDGPU/pseudo-scalar-transcendental.ll | 37 +-
llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 8 +-
llvm/test/CodeGen/AMDGPU/v_swap_b16.ll | 10 +-
.../CodeGen/AMDGPU/valu-mask-write-hazard.mir | 168 ++--
.../CodeGen/AMDGPU/valu-read-sgpr-hazard.mir | 862 ++++++++++++++++++
.../CodeGen/AMDGPU/vcmpx-permlane-hazard.mir | 5 +-
91 files changed, 5948 insertions(+), 932 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a8b171aa82840a..b2eabd2117f151 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,6 +14,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
@@ -44,6 +45,10 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
cl::desc("Fill a percentage of the latency between "
"neighboring MFMA with s_nops."));
+static cl::opt<unsigned> MaxExhaustiveHazardSearch(
+ "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
+ cl::desc("Maximum function size for exhausive hazard search"));
+
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
@@ -51,15 +56,11 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
const GCNSubtarget &ST);
-GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
- IsHazardRecognizerMode(false),
- CurrCycleInstr(nullptr),
- MF(MF),
- ST(MF.getSubtarget<GCNSubtarget>()),
- TII(*ST.getInstrInfo()),
- TRI(TII.getRegisterInfo()),
- ClauseUses(TRI.getNumRegUnits()),
- ClauseDefs(TRI.getNumRegUnits()) {
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
+ : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
+ ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false),
+ ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
TSchedModel.init(&ST);
RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
@@ -1105,6 +1106,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixWMMAHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
+ fixVALUReadSGPRHazard(MI);
fixRequiredExportPriority(MI);
}
@@ -2761,6 +2763,36 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
return false;
}
+// Adjust global offsets for instructions bundled with S_GETPC_B64 after
+// insertion of a new instruction.
+static void updateGetPCBundle(MachineInstr *NewMI) {
+ if (!NewMI->isBundled())
+ return;
+
+ // Find start of bundle.
+ auto I = NewMI->getIterator();
+ while (I->isBundledWithPred())
+ I--;
+ if (I->isBundle())
+ I++;
+
+ // Bail if this is not an S_GETPC bundle.
+ if (I->getOpcode() != AMDGPU::S_GETPC_B64)
+ return;
+
+ // Update offsets of any references in the bundle.
+ const unsigned NewBytes = NewMI->getDesc().getSize();
+ auto NextMI = std::next(NewMI->getIterator());
+ auto End = NewMI->getParent()->end();
+ while (NextMI != End && NextMI->isBundledWithPred()) {
+ for (auto &Operand : NextMI->operands()) {
+ if (Operand.isGlobal())
+ Operand.setOffset(Operand.getOffset() + NewBytes);
+ }
+ NextMI++;
+ }
+}
+
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
if (!ST.hasVALUMaskWriteHazard())
return false;
@@ -2878,22 +2910,269 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
auto NextMI = std::next(MI->getIterator());
// Add s_waitcnt_depctr sa_sdst(0) after SALU write.
- BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
- TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
// SALU write may be s_getpc in a bundle.
- if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
- // Update offsets of any references in the bundle.
- while (NextMI != MI->getParent()->end() &&
- NextMI->isBundledWithPred()) {
- for (auto &Operand : NextMI->operands()) {
- if (Operand.isGlobal())
- Operand.setOffset(Operand.getOffset() + 4);
+ updateGetPCBundle(NewMI);
+
+ return true;
+}
+
+static unsigned baseSGPRNumber(Register Reg, const SIRegisterInfo &TRI) {
+ unsigned RegN = TRI.getEncodingValue(Reg);
+ assert(RegN <= 127);
+ return (RegN >> 1) & 0x3f;
+}
+
+// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
+void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
+ assert(MMF == &MF);
+
+ // Assume non-empty vector means it has already been computed.
+ if (!VALUReadHazardSGPRs.empty())
+ return;
+
+ auto CallingConv = MF.getFunction().getCallingConv();
+ bool IsCallFree =
+ AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
+
+ // Exhaustive search is only viable in non-caller/callee functions where
+ // VALUs will be exposed to the hazard recognizer.
+ UseVALUReadHazardExhaustiveSearch =
+ IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
+ MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
+
+ // Consider all SGPRs as hazards if the shader uses function calls or is a callee.
+ bool UseVALUUseCache =
+ IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
+ VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
+ if (!UseVALUUseCache)
+ return;
+
+ // Perform a post ordered reverse scan to find VALUs which read an SGPR
+ // before a SALU write to the same SGPR. This provides a reduction in
+ // hazard insertion when all VALU access to an SGPR occurs after its last
+ // SALU write, when compared to a linear scan.
+ const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
+ MachineCycleInfo CI;
+ CI.compute(*MMF);
+
+ for (auto *MBB : post_order(&MF)) {
+ bool InCycle = CI.getCycle(MBB) != nullptr;
+ for (auto &MI : reverse(MBB->instrs())) {
+ bool IsVALU = SIInstrInfo::isVALU(MI);
+ bool IsSALU = SIInstrInfo::isSALU(MI);
+ if (!(IsVALU || IsSALU))
+ continue;
+
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ // Only consider implicit operands of VCC.
+ if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
+ Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
+ continue;
+ if (!TRI.isSGPRReg(MRI, Reg))
+ continue;
+ if (TRI.getEncodingValue(Reg) >= SGPR_NULL)
+ continue;
+ unsigned RegN = baseSGPRNumber(Reg, TRI);
+ if (IsVALU && Op.isUse()) {
+ // Note: any access within a cycle must be considered a hazard.
+ if (InCycle || (ReadSGPRs[RegN] && SALUWriteSGPRs[RegN]))
+ VALUReadHazardSGPRs.set(RegN);
+ ReadSGPRs.set(RegN);
+ } else if (IsSALU) {
+ if (Op.isDef())
+ SALUWriteSGPRs.set(RegN);
+ else
+ ReadSGPRs.set(RegN);
+ }
}
- NextMI++;
}
}
+}
+
+bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
+ if (!ST.hasVALUReadSGPRHazard())
+ return false;
+
+ // The hazard sequence is fundamentally three instructions:
+ // 1. VALU reads SGPR
+ // 2. SALU writes SGPR
+ // 3. VALU/SALU reads SGPR
+ // Try to avoid searching for (1) because the expiry point of the hazard is
+ // indeterminate; however, the hazard between (2) and (3) can expire if the
+ // gap contains sufficient SALU instructions with no usage of SGPR from (1).
+ // Note: SGPRs must be considered as 64-bit pairs as hazard exists
+ // even if individual SGPRs are accessed.
+
+ bool MIIsSALU = SIInstrInfo::isSALU(*MI);
+ bool MIIsVALU = SIInstrInfo::isVALU(*MI);
+ if (!(MIIsSALU || MIIsVALU))
+ return false;
+
+ // Avoid expensive search when compile time is priority by
+ // mitigating every SALU which writes an SGPR.
+ if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
+ if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
+ return false;
+
+ const MachineOperand *SDSTOp =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
+ if (!SDSTOp || !SDSTOp->isReg())
+ return false;
+
+ const Register HazardReg = SDSTOp->getReg();
+ if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
+ HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
+ return false;
+
+ // Add s_wait_alu sa_sdst(0) after SALU write.
+ auto NextMI = std::next(MI->getIterator());
+ auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+
+ // SALU write may be s_getpc in a bundle.
+ updateGetPCBundle(NewMI);
+
+ return true;
+ }
+
+ // Pre-compute set of SGPR pairs read by VALUs.
+ // Note: pass mutable pointer to MachineFunction for CycleInfo.
+ computeVALUHazardSGPRs(MI->getMF());
+
+ // If no VALU hazard SGPRs exist then there is nothing to do.
+ if (VALUReadHazardSGPRs.none())
+ return false;
+
+ // All SGPR writes before a call/return must be flushed as the callee/caller
+ // will not see the hazard chain, i.e. (2) to (3) described above.
+ const bool IsSetPC = (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_CALL_B64);
+
+ // Collect all SGPR sources for MI which are read by a VALU.
+ const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallSet<Register, 4> SGPRsUsed;
+
+ if (!IsSetPC) {
+ for (const MachineOperand &Op : MI->all_uses()) {
+ Register OpReg = Op.getReg();
+
+ // Only consider VCC implicit uses on VALUs.
+ // The only expected SALU implicit access is SCC which is no hazard.
+ if (MIIsSALU && Op.isImplicit())
+ continue;
+
+ if (!TRI.isSGPRReg(MRI, OpReg))
+ continue;
+
+ // Ignore special-purpose registers such as NULL, EXEC, and M0.
+ if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
+ continue;
+
+ unsigned RegN = baseSGPRNumber(OpReg, TRI);
+ if (!VALUReadHazardSGPRs[RegN])
+ continue;
+
+ SGPRsUsed.insert(OpReg);
+ }
+
+ // No SGPRs -> nothing to do.
+ if (SGPRsUsed.empty())
+ return false;
+ }
+
+ // A hazard is any SALU which writes one of the SGPRs read by MI.
+ auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
+ if (!SIInstrInfo::isSALU(I))
+ return false;
+ // Ensure SGPR flush before call/return by conservatively assuming every
+ // SALU writes an SGPR.
+ if (IsSetPC && I.getNumDefs() > 0)
+ return true;
+ // Check for any register writes.
+ return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
+ return I.modifiesRegister(Reg, &TRI);
+ });
+ };
+
+ const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
+ auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
+ if (Count >= SALUExpiryCount)
+ return true;
+ // s_wait_alu sa_sdst(0) on path mitigates hazard.
+ if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
+ return true;
+ return false;
+ };
+
+ auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
+ // Only count true SALUs as wait states.
+ if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
+ return 0;
+ // SALU must be unrelated to any hazard registers.
+ if (llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
+ return I.readsRegister(Reg, &TRI);
+ }))
+ return 0;
+ return 1;
+ };
+
+ // Check for the hazard.
+ DenseSet<const MachineBasicBlock *> Visited;
+ int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, WaitStatesFn);
+
+ if (WaitStates >= SALUExpiryCount)
+ return false;
+
+ // Validate hazard through an exhaustive search.
+ if (UseVALUReadHazardExhaustiveSearch) {
+ // A hazard is any VALU which reads one of the paired SGPRs read by MI.
+ // This is searching for (1) in the hazard description.
+ auto hazardPair = [this](Register Reg) {
+ if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
+ return Register(AMDGPU::VCC);
+ // TODO: handle TTMP?
+ return Register(AMDGPU::SGPR0_SGPR1 + baseSGPRNumber(Reg, TRI));
+ };
+ auto SearchHazardFn = [this, hazardPair,
+ &SGPRsUsed](const MachineInstr &I) {
+ if (!SIInstrInfo::isVALU(I))
+ return false;
+ // Check for any register reads.
+ return llvm::any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
+ return I.readsRegister(hazardPair(Reg), &TRI);
+ });
+ };
+ auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
+ return false;
+ };
+ if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+ }
+
+ // Add s_wait_alu sa_sdst(0) before SALU read.
+ auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+
+ // SALU read may be after s_getpc in a bundle.
+ updateGetPCBundle(NewMI);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index f2a64ab48e180c..e840e2445188fb 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -48,6 +48,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const SIRegisterInfo &TRI;
TargetSchedModel TSchedModel;
bool RunLdsBranchVmemWARHazardFixup;
+ BitVector VALUReadHazardSGPRs;
+ bool UseVALUReadHazardExhaustiveSearch;
/// RegUnits of uses in the current soft memory clause.
BitVector ClauseUses;
@@ -107,6 +109,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixWMMAHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
+ void computeVALUHazardSGPRs(MachineFunction *MMF);
+ bool fixVALUReadSGPRHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index def89c785b8552..0e956406e0bfb4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1246,6 +1246,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
+ bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
+
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts; }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 61d2c854dffa55..df81b926bceb39 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -623,13 +623,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -779,12 +781,14 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1211,13 +1215,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1365,12 +1371,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1868,12 +1876,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2059,12 +2069,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 83be67a9138f6e..53d9bf0751a1d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -623,13 +623,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -779,12 +781,14 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1211,13 +1215,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1365,12 +1371,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1868,12 +1876,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2059,12 +2069,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index f4fd803c8dda89..a8579f26830351 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1414,6 +1414,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_mov_b32 s2, 3
; GFX12-NEXT: s_mov_b32 s1, 2
; GFX12-NEXT: s_mov_b32 s0, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1504,6 +1505,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_mov_b32 s2, 3
; GFX12-NEXT: s_mov_b32 s1, 2
; GFX12-NEXT: s_mov_b32 s0, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 17fe3adc221692..5cf9c9faa693ed 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -226,6 +226,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -233,8 +234,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -263,13 +265,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB0_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -509,6 +514,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -517,8 +523,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
@@ -547,10 +554,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB1_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
@@ -559,6 +568,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
@@ -878,7 +888,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -899,6 +909,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_cbranch_execz .LBB2_4
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -924,22 +935,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB2_4
; GFX12W32-NEXT: ; %bb.3:
@@ -948,6 +962,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1289,7 +1304,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -1312,6 +1327,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_clause 0x1
; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mov_b32_e32 v2, s5
@@ -1338,22 +1354,25 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB3_4
; GFX12W32-NEXT: ; %bb.3:
@@ -1364,6 +1383,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB3_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1719,6 +1739,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1726,8 +1747,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -1757,13 +1779,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB5_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -2006,6 +2031,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -2014,8 +2040,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
@@ -2045,10 +2072,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB6_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
@@ -2378,7 +2407,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -2399,6 +2428,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_cbranch_execz .LBB7_4
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -2424,22 +2454,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB7_4
; GFX12W32-NEXT: ; %bb.3:
@@ -2448,6 +2481,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 16f3ff4be6b501..ed036a83b6143b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -266,6 +266,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -273,8 +274,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_mul_i32 s6, s6, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v1, s6
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
@@ -307,10 +310,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB0_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_mul_i32 s5, s5, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mov_b32_e32 v1, s5
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
@@ -320,6 +326,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB0_2:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
@@ -597,6 +604,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -640,6 +648,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB1_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
@@ -1012,7 +1021,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
@@ -1033,6 +1042,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
@@ -1064,23 +1074,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6
; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
; GFX1232_ITERATIVE-NEXT: ; %bb.3:
@@ -1095,6 +1108,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: .LBB2_4:
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
@@ -1520,6 +1534,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1527,12 +1542,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1264_DPP-NEXT: s_mov_b32 s4, s9
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
@@ -1550,6 +1567,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1264_DPP-NEXT: .LBB2_2:
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0
@@ -1595,8 +1613,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
@@ -1604,6 +1622,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1232_DPP-NEXT: ; %bb.1:
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
@@ -1900,6 +1919,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_mov_b32 s9, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1910,6 +1930,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
; GFX1264-NEXT: v_mov_b32_e32 v1, s7
; GFX1264-NEXT: s_wait_kmcnt 0x0
@@ -1945,10 +1966,13 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
@@ -2288,6 +2312,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2298,6 +2323,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
; GFX1264-NEXT: s_mov_b32 s10, -1
@@ -2336,6 +2362,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB4_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
@@ -2349,6 +2376,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB4_2:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
@@ -2763,7 +2791,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
@@ -2785,6 +2813,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
@@ -2819,8 +2848,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
@@ -2828,6 +2858,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3594,6 +3625,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63
; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32
; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -3601,6 +3633,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48
; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9]
; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec
@@ -3619,6 +3652,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1264_DPP-NEXT: .LBB5_2:
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8
@@ -3698,6 +3732,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16
; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo
@@ -3715,6 +3750,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1232_DPP-NEXT: .LBB5_2:
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8
@@ -3989,6 +4025,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -3996,8 +4033,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_mul_i32 s6, s6, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v1, s6
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
@@ -4031,10 +4070,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB6_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_mul_i32 s5, s5, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mov_b32_e32 v1, s5
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
@@ -4044,6 +4086,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB6_2:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
@@ -4326,6 +4369,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -4370,6 +4414,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB7_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
@@ -4743,7 +4788,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
@@ -4764,6 +4809,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
@@ -4795,23 +4841,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6
; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
; GFX1232_ITERATIVE-NEXT: ; %bb.3:
@@ -4826,6 +4875,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: .LBB8_4:
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
@@ -5251,6 +5301,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5258,12 +5309,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1264_DPP-NEXT: s_mov_b32 s4, s9
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
@@ -5281,6 +5334,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1264_DPP-NEXT: .LBB8_2:
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0
@@ -5326,8 +5380,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
@@ -5335,6 +5389,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2
; GFX1232_DPP-NEXT: ; %bb.1:
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
@@ -5646,6 +5701,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_mov_b32 s9, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5656,6 +5712,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
; GFX1264-NEXT: v_mov_b32_e32 v1, s7
; GFX1264-NEXT: s_wait_kmcnt 0x0
@@ -5694,10 +5751,13 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
@@ -6053,6 +6113,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6063,6 +6124,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
; GFX1264-NEXT: s_mov_b32 s10, -1
@@ -6105,6 +6167,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB10_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
@@ -6118,6 +6181,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB10_2:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
@@ -6536,7 +6600,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
@@ -6558,6 +6622,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
@@ -6592,8 +6657,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
@@ -6601,6 +6667,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -7367,6 +7434,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63
; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32
; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7374,6 +7442,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48
; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9]
; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec
@@ -7392,6 +7461,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1264_DPP-NEXT: .LBB11_2:
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8
@@ -7471,6 +7541,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16
; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo
@@ -7488,6 +7559,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1232_DPP-NEXT: .LBB11_2:
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 3a2efadac067d0..9d4dfd8911257a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -225,6 +225,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -232,8 +233,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -262,13 +264,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB0_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -508,6 +513,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -516,8 +522,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
@@ -546,10 +553,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB1_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
@@ -558,6 +567,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
@@ -877,7 +887,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -898,6 +908,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_cbranch_execz .LBB2_4
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -923,22 +934,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB2_4
; GFX12W32-NEXT: ; %bb.3:
@@ -947,6 +961,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1302,6 +1317,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1309,8 +1325,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -1340,13 +1357,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB4_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB4_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1589,6 +1609,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1597,8 +1618,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
@@ -1628,10 +1650,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB5_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
@@ -1961,7 +1985,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -1982,6 +2006,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_cbranch_execz .LBB6_4
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -2007,22 +2032,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB6_4
; GFX12W32-NEXT: ; %bb.3:
@@ -2031,6 +2059,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index d0c0b62c78e42b..3fb44e090c61f2 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -232,6 +232,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -240,8 +241,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
@@ -270,13 +272,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB0_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -523,6 +528,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -532,8 +538,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
@@ -562,10 +569,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB1_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
@@ -574,6 +583,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
@@ -899,7 +909,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -921,6 +931,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
@@ -946,22 +957,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB2_4
; GFX12W32-NEXT: ; %bb.3:
@@ -970,6 +984,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1466,6 +1481,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1474,8 +1490,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
@@ -1505,13 +1522,16 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB5_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1761,6 +1781,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1770,8 +1791,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
@@ -1801,10 +1823,12 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB6_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
@@ -2140,7 +2164,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -2162,6 +2186,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
@@ -2187,22 +2212,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB7_4
; GFX12W32-NEXT: ; %bb.3:
@@ -2211,6 +2239,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 4f0bc512565d13..1b0e77bfd8f2db 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -112,7 +112,7 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SYS
; GFX1200-NEXT: s_setpc_b64 s[30:31]
- %res = atomicrmw fadd ptr %addr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ %res = atomicrmw fadd ptr %addr, float %val seq_cst
ret float %res
}
@@ -216,7 +216,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SE
; GFX1200-NEXT: s_setpc_b64 s[30:31]
- %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret float %res
}
@@ -351,7 +351,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SE
; GFX1200-NEXT: s_setpc_b64 s[30:31]
- %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret void
}
@@ -447,6 +447,4 @@ define float @no_unsafe(ptr %addr, float %val) {
ret float %res
}
-attributes #0 = { nounwind }
-
-!0 = !{}
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 9d9e6898417e87..07127bfe638317 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -436,13 +436,16 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2:
@@ -450,6 +453,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -2049,7 +2053,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -2067,12 +2071,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
@@ -2323,7 +2329,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
@@ -2340,12 +2346,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
@@ -2596,11 +2604,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2:
@@ -2625,11 +2636,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2639,11 +2653,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB10_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -3113,7 +3128,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -3131,12 +3146,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
@@ -3407,7 +3424,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -3425,12 +3442,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -3684,14 +3703,17 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v5, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
@@ -3704,6 +3726,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
@@ -3713,12 +3736,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -4054,14 +4078,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
@@ -4074,6 +4101,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
@@ -4083,11 +4111,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -4429,10 +4458,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2:
@@ -4462,11 +4494,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
@@ -4476,12 +4511,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_mov_b32_e32 v7, v8
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB15_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -5054,14 +5090,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
@@ -5082,6 +5121,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -5091,12 +5131,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5479,14 +5520,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -5507,6 +5551,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -5516,11 +5561,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5908,10 +5954,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
@@ -5951,11 +6000,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
@@ -5965,12 +6017,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -7124,13 +7177,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2:
@@ -7138,6 +7194,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -9582,13 +9639,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2:
@@ -9596,6 +9656,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 172ce4c065e13d..f11a651fe409c1 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -428,13 +428,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2:
@@ -442,6 +445,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -1297,7 +1301,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -1317,12 +1321,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
@@ -1514,7 +1520,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -1533,12 +1539,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
@@ -1734,11 +1742,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2:
@@ -1766,11 +1777,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1780,11 +1794,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB7_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -2149,7 +2164,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -2169,12 +2184,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
@@ -2460,7 +2477,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -2480,12 +2497,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -2680,14 +2699,17 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -2700,8 +2722,9 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -2711,12 +2734,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3066,14 +3090,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -3086,8 +3113,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -3097,11 +3125,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3456,10 +3485,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2:
@@ -3493,11 +3525,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
@@ -3507,12 +3542,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB12_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4098,14 +4134,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
@@ -4126,6 +4165,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -4135,12 +4175,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4525,14 +4566,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
@@ -4553,6 +4597,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -4562,11 +4607,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4956,10 +5002,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2:
@@ -4999,11 +5048,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
@@ -5013,12 +5065,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB15_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -5640,8 +5693,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -5659,12 +5713,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
@@ -5984,6 +6040,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: v_mov_b32_e32 v1, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
@@ -6001,12 +6058,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
@@ -6326,11 +6385,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
@@ -6358,11 +6420,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
@@ -6372,12 +6437,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -6933,8 +6999,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -6968,12 +7034,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
@@ -7378,7 +7446,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s5, 0
@@ -7410,12 +7478,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
@@ -7819,11 +7889,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2:
@@ -7866,11 +7939,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
@@ -7880,12 +7956,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 61ee956747135f..72d4bbb3c2a2a7 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -428,13 +428,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2:
@@ -442,6 +445,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -1297,7 +1301,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -1317,12 +1321,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
@@ -1514,7 +1520,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -1533,12 +1539,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
@@ -1734,11 +1742,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2:
@@ -1766,11 +1777,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1780,11 +1794,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB7_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -2149,7 +2164,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -2169,12 +2184,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
@@ -2460,7 +2477,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -2480,12 +2497,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -2680,14 +2699,17 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -2700,8 +2722,9 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -2711,12 +2734,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3066,14 +3090,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -3086,8 +3113,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -3097,11 +3125,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3456,10 +3485,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2:
@@ -3493,11 +3525,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
@@ -3507,12 +3542,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB12_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -4098,14 +4134,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
@@ -4126,6 +4165,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -4135,12 +4175,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4525,14 +4566,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
@@ -4553,6 +4597,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -4562,11 +4607,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4956,10 +5002,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2:
@@ -4999,11 +5048,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
@@ -5013,12 +5065,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB15_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -5640,8 +5693,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -5659,12 +5713,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
@@ -5984,6 +6040,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: v_mov_b32_e32 v1, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
@@ -6001,12 +6058,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
@@ -6326,11 +6385,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
@@ -6358,11 +6420,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
@@ -6372,12 +6437,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -6933,8 +6999,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -6968,12 +7034,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
@@ -7378,7 +7446,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s5, 0
@@ -7410,12 +7478,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
@@ -7819,11 +7889,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2:
@@ -7866,11 +7939,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
@@ -7880,12 +7956,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 422c8a0be23b49..1ae1204e3cde18 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -5707,13 +5707,15 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
@@ -5888,13 +5890,15 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6070,13 +6074,15 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -6263,12 +6269,14 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
@@ -6432,12 +6440,14 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6608,12 +6618,14 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -6816,13 +6828,15 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
@@ -7105,13 +7119,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7403,13 +7419,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -7700,12 +7718,14 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
@@ -7978,12 +7998,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8265,12 +8287,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8541,12 +8565,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -8756,13 +8782,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -8990,13 +9018,15 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9289,12 +9319,14 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9590,13 +9622,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -9933,13 +9967,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10286,13 +10322,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10639,12 +10677,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10981,12 +11021,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11313,13 +11355,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -11596,12 +11640,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -11882,12 +11928,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -12215,13 +12263,15 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12569,12 +12619,14 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index ea2427a3c420f9..ed78f4a071e3d0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -2800,13 +2800,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2954,13 +2956,15 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3113,13 +3117,15 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3286,12 +3292,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3435,12 +3443,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3591,12 +3601,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3761,13 +3773,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
@@ -3971,13 +3985,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -4143,13 +4159,15 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4446,13 +4464,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -4758,13 +4778,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5070,12 +5092,14 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
@@ -5364,12 +5388,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5667,12 +5693,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5959,13 +5987,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -6195,12 +6225,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -6437,13 +6469,15 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6751,12 +6785,14 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7065,13 +7101,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -7409,13 +7447,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7763,13 +7803,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8115,12 +8157,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -8448,12 +8492,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8791,12 +8837,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -9124,13 +9172,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -9408,12 +9458,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -9698,13 +9750,15 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10053,12 +10107,14 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10381,13 +10437,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -10616,13 +10674,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10854,13 +10914,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11110,12 +11172,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -11336,12 +11400,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11569,12 +11635,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11820,13 +11888,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12060,12 +12130,14 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12317,13 +12389,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -12662,13 +12736,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13010,13 +13086,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13375,12 +13453,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -13709,12 +13789,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB58_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14050,12 +14132,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB59_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14410,13 +14494,15 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB60_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14759,12 +14845,14 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB61_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 2767b66e447030..bdb945a652eb21 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -2800,13 +2800,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2954,13 +2956,15 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3113,13 +3117,15 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3286,12 +3292,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3435,12 +3443,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3591,12 +3601,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3761,13 +3773,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
@@ -3971,13 +3985,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -4143,13 +4159,15 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4446,13 +4464,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -4758,13 +4778,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5070,12 +5092,14 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
@@ -5364,12 +5388,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5667,12 +5693,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5959,13 +5987,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -6195,12 +6225,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -6437,13 +6469,15 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6751,12 +6785,14 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7065,13 +7101,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -7409,13 +7447,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7763,13 +7803,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8115,12 +8157,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -8448,12 +8492,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8791,12 +8837,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -9124,13 +9172,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -9408,12 +9458,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -9698,13 +9750,15 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10053,12 +10107,14 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10381,13 +10437,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -10616,13 +10674,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10854,13 +10914,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11110,12 +11172,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -11336,12 +11400,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11569,12 +11635,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11820,13 +11888,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12060,12 +12130,14 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12317,13 +12389,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -12662,13 +12736,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13010,13 +13086,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13375,12 +13453,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -13709,12 +13789,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB58_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14050,12 +14132,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB59_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14410,13 +14494,15 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB60_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14759,12 +14845,14 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB61_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 6672f16c4a7a8d..c7f2bf6d1b317f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -34,13 +34,15 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32:
@@ -229,13 +231,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
@@ -428,13 +432,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
@@ -644,12 +650,14 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32:
@@ -829,12 +837,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
@@ -1021,12 +1031,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
@@ -1232,13 +1244,15 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
@@ -1432,12 +1446,14 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
@@ -1631,13 +1647,15 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
@@ -1826,13 +1844,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
@@ -2025,13 +2045,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
@@ -2241,12 +2263,14 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
@@ -2426,12 +2450,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
@@ -2618,12 +2644,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
@@ -2829,13 +2857,15 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
@@ -3029,12 +3059,14 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
@@ -3228,13 +3260,15 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64:
@@ -3439,13 +3473,15 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
@@ -3651,13 +3687,15 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
@@ -3877,12 +3915,14 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64:
@@ -4072,12 +4112,14 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
@@ -4274,12 +4316,14 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
@@ -4513,13 +4557,15 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16:
@@ -4802,13 +4848,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
@@ -5100,13 +5148,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
@@ -5397,12 +5447,14 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16:
@@ -5675,12 +5727,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
@@ -5962,12 +6016,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
@@ -6239,13 +6295,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
@@ -6460,12 +6518,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
@@ -6687,13 +6747,15 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
@@ -6986,12 +7048,14 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
@@ -7287,13 +7351,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16:
@@ -7630,13 +7696,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -7983,13 +8051,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -8334,12 +8404,14 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16:
@@ -8666,12 +8738,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
@@ -9008,12 +9082,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
@@ -9340,13 +9416,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -9623,12 +9701,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
@@ -9912,13 +9992,15 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -10266,12 +10348,14 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
@@ -10591,13 +10675,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16:
@@ -10809,13 +10895,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
@@ -11030,13 +11118,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
@@ -11268,12 +11358,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16:
@@ -11475,12 +11567,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
@@ -11689,12 +11783,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
@@ -11922,13 +12018,15 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
@@ -12144,12 +12242,14 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
@@ -12385,13 +12485,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
@@ -12730,13 +12832,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -13078,13 +13182,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
@@ -13443,12 +13549,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
@@ -13777,12 +13885,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
@@ -14118,12 +14228,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
@@ -14478,13 +14590,15 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -14827,12 +14941,14 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 14d8b71c5167a2..dc452823416f87 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -265,10 +265,11 @@ define void @zero_init_foo() {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
@@ -354,10 +355,11 @@ define void @zero_init_foo() {
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: s_mov_b32 s0, 0
-; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_mov_b32 s1, s0
; GFX12-PAL-NEXT: s_mov_b32 s2, s0
; GFX12-PAL-NEXT: s_mov_b32 s3, s0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
@@ -1351,10 +1353,11 @@ define void @zero_init_small_offset_foo() {
; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
@@ -1450,10 +1453,11 @@ define void @zero_init_small_offset_foo() {
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_mov_b32 s0, 0
-; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_mov_b32 s1, s0
; GFX12-PAL-NEXT: s_mov_b32 s2, s0
; GFX12-PAL-NEXT: s_mov_b32 s3, s0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
@@ -2562,10 +2566,11 @@ define void @zero_init_large_offset_foo() {
; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
@@ -2702,10 +2707,11 @@ define void @zero_init_large_offset_foo() {
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_mov_b32 s0, 0
-; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_mov_b32 s1, s0
; GFX12-PAL-NEXT: s_mov_b32 s2, s0
; GFX12-PAL-NEXT: s_mov_b32 s3, s0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 3b7009023b03af..04cd150d93176f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -174,7 +174,8 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x ha
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, s0, s2
; GFX12-GISEL-NEXT: s_maximum_f16 s0, s1, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 9ce1ba3316dd5b..27282a453075b3 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -462,7 +462,7 @@ define float @v_fmaximum3_f32_const1_const2(float %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0x41000000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -1409,7 +1409,7 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_movk_i32 s0, 0x4800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 817e6dd87361ff..3271758f712971 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -174,7 +174,8 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x ha
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, s0, s2
; GFX12-GISEL-NEXT: s_minimum_f16 s0, s1, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 21074d58bdb7e2..d9ba2de48bb010 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -462,7 +462,7 @@ define float @v_fminimum3_f32_const1_const2(float %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0x41000000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -1409,7 +1409,7 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_movk_i32 s0, 0x4800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 064238c63717ec..361cc1e9e6c1db 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -6963,13 +6963,15 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
@@ -7184,13 +7186,15 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7406,13 +7410,15 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -7635,12 +7641,14 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
@@ -7838,12 +7846,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8044,12 +8054,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8277,13 +8289,15 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
@@ -8616,13 +8630,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8966,13 +8982,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -9315,12 +9333,14 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
@@ -9642,12 +9662,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9979,12 +10001,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10306,13 +10330,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -10566,12 +10592,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -10830,13 +10858,15 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11181,12 +11211,14 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11532,13 +11564,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -11925,13 +11959,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12330,13 +12366,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -12733,12 +12771,14 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -13114,12 +13154,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB58_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13506,12 +13548,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB59_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13888,13 +13932,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB60_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -14210,12 +14256,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB61_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -14536,13 +14584,15 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB62_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14942,12 +14992,14 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB63_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -23072,8 +23124,10 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX12-NEXT: s_cbranch_execz .LBB92_2
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0
; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 06d971febd0380..84003a0432f7ef 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -3020,13 +3020,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -3192,13 +3194,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3365,13 +3369,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3538,12 +3544,14 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3700,12 +3708,14 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3865,12 +3875,14 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -4030,13 +4042,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
@@ -4278,13 +4292,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -4468,13 +4484,15 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4821,13 +4839,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5185,13 +5205,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5549,12 +5571,14 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
@@ -5892,12 +5916,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6245,12 +6271,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -6587,13 +6615,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -6862,12 +6892,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -7141,13 +7173,15 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7507,12 +7541,14 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7871,13 +7907,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -8266,13 +8304,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8673,13 +8713,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -9078,12 +9120,14 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -9461,12 +9505,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9855,12 +9901,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10239,13 +10287,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -10563,12 +10613,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -10891,13 +10943,15 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11299,12 +11353,14 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11678,13 +11734,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -11970,13 +12028,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12264,13 +12324,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -12562,12 +12624,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -12843,12 +12907,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13127,12 +13193,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13419,13 +13487,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13715,12 +13785,14 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14023,13 +14095,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -14421,13 +14495,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14821,13 +14897,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -15224,12 +15302,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -15609,12 +15689,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB58_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15997,12 +16079,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB59_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -16394,13 +16478,15 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB60_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16795,12 +16881,14 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB61_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 65df8f07fb8b3b..2aad91cd1071fc 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -3020,13 +3020,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -3192,13 +3194,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3365,13 +3369,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3538,12 +3544,14 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3700,12 +3708,14 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3865,12 +3875,14 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -4030,13 +4042,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
@@ -4278,13 +4292,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -4468,13 +4484,15 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4821,13 +4839,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5185,13 +5205,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5549,12 +5571,14 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
@@ -5892,12 +5916,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6245,12 +6271,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -6587,13 +6615,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -6862,12 +6892,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -7141,13 +7173,15 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7507,12 +7541,14 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7871,13 +7907,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -8266,13 +8304,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8673,13 +8713,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -9078,12 +9120,14 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -9461,12 +9505,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9855,12 +9901,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10239,13 +10287,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -10563,12 +10613,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -10891,13 +10943,15 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11299,12 +11353,14 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11678,13 +11734,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -11970,13 +12028,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12264,13 +12324,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -12562,12 +12624,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -12843,12 +12907,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13127,12 +13193,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13419,13 +13487,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13715,12 +13785,14 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14023,13 +14095,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -14421,13 +14495,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14821,13 +14897,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -15224,12 +15302,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -15609,12 +15689,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB58_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15997,12 +16079,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB59_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -16394,13 +16478,15 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB60_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16795,12 +16881,14 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB61_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 79aa69771f84bd..2e3799e1714afe 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -35,13 +35,15 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32:
@@ -266,13 +268,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos:
@@ -499,13 +503,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg:
@@ -741,12 +747,14 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32:
@@ -961,12 +969,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos:
@@ -1184,12 +1194,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg:
@@ -1416,13 +1428,15 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos:
@@ -1650,12 +1664,14 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos:
@@ -1880,13 +1896,15 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__ftz:
@@ -2111,13 +2129,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
@@ -2344,13 +2364,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
@@ -2586,12 +2608,14 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__ftz:
@@ -2806,12 +2830,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
@@ -3029,12 +3055,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
@@ -3261,13 +3289,15 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
@@ -3495,12 +3525,14 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
@@ -3725,13 +3757,15 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f64:
@@ -3976,13 +4010,15 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos:
@@ -4228,13 +4264,15 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg:
@@ -4487,12 +4525,14 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f64:
@@ -4716,12 +4756,14 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos:
@@ -4948,12 +4990,14 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg:
@@ -5207,13 +5251,15 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f16:
@@ -5546,13 +5592,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
@@ -5896,13 +5944,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
@@ -6245,12 +6295,14 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f16:
@@ -6572,12 +6624,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
@@ -6909,12 +6963,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
@@ -7236,13 +7292,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
@@ -7496,12 +7554,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
@@ -7760,13 +7820,15 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
@@ -8111,12 +8173,14 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
@@ -8462,13 +8526,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16:
@@ -8855,13 +8921,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -9260,13 +9328,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -9663,12 +9733,14 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16:
@@ -10044,12 +10116,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
@@ -10436,12 +10510,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
@@ -10818,13 +10894,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -11140,12 +11218,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
@@ -11466,13 +11546,15 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -11872,12 +11954,14 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
@@ -12247,13 +12331,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16:
@@ -12522,13 +12608,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos:
@@ -12799,13 +12887,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg:
@@ -13079,12 +13169,14 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16:
@@ -13341,12 +13433,14 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos:
@@ -13606,12 +13700,14 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg:
@@ -13880,13 +13976,15 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos:
@@ -14158,12 +14256,14 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos:
@@ -14450,13 +14550,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16:
@@ -14848,13 +14950,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -15248,13 +15352,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
@@ -15651,12 +15757,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16:
@@ -16036,12 +16144,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
@@ -16424,12 +16534,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
@@ -16821,13 +16933,15 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -17222,12 +17336,14 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index d1a371fc4356f8..24fd709514b476 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -280,7 +280,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -571,7 +571,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -864,7 +864,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1155,7 +1155,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1448,7 +1448,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1739,7 +1739,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -2014,7 +2014,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -2287,7 +2287,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -2562,7 +2562,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -2835,7 +2835,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -3110,7 +3110,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -3383,7 +3383,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -3658,7 +3658,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -3931,7 +3931,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -4224,7 +4224,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -4515,7 +4515,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -4920,7 +4920,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5504,7 +5504,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5795,7 +5795,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6102,7 +6102,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 scope:SCOPE_DEV
@@ -6196,7 +6196,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6427,7 +6427,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] scope:SCOPE_DEV
@@ -6518,7 +6518,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
new file mode 100644
index 00000000000000..4aa49f2c9296d7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=gfx1201 %s -o - | FileCheck %s
+
+define amdgpu_kernel void @foo() {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
+; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
+; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3]
+; CHECK-NEXT: s_endpgm
+entry:
+ br label %bb1
+
+bb0:
+ br label %bb1
+
+bb1:
+ %dst = phi ptr [ null, %bb0 ], [ addrspacecast (ptr addrspace(3) null to ptr), %entry ]
+ store i64 0, ptr %dst, align 16
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index eb4cba35e9946e..44a2c34b06b574 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -44,15 +44,19 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-LABEL: indirect_call_known_no_special_inputs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_getpc_b64 s[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+24
; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], 0
; GFX12-NEXT: s_getpc_b64 s[8:9]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s9, s9
-; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+24
; GFX12-NEXT: s_load_u8 s12, s[4:5], 0x0
; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX12-NEXT: s_load_b64 s[6:7], s[8:9], 0x0
@@ -61,12 +65,13 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-NEXT: s_mov_b32 s32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s8, 1, s12
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_eq_u32 s8, 1
; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-NEXT: s_cselect_b32 s7, s7, s5
; GFX12-NEXT: s_cselect_b32 s6, s6, s4
; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 3b972352e0e450..0045082eedb0a3 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -282,13 +282,15 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
ret i32 %result
@@ -401,15 +403,18 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) {
; GFX12-NEXT: scratch_load_b32 v32, off, s32
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_getpc_b64 s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s1, s1
-; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+24
; GFX12-NEXT: scratch_store_b32 off, v32, s32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[32:33], off, s32 offset:24
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: scratch_store_b64 off, v[32:33], s32 offset:16
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[0:1]
entry:
%alloca = alloca double, align 8, addrspace(5)
@@ -587,25 +592,34 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cvt_u32_f32 s4, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_i32 s5, s5, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_i32 s4, s4, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_hi_u32 s4, s2, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s5, s4, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sub_co_i32 s2, s2, s5
; GFX12-NEXT: s_add_co_i32 s5, s4, 1
; GFX12-NEXT: s_sub_co_i32 s6, s2, s3
; GFX12-NEXT: s_cmp_ge_u32 s2, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cselect_b32 s4, s5, s4
; GFX12-NEXT: s_cselect_b32 s2, s6, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_i32 s5, s4, 1
; GFX12-NEXT: s_cmp_ge_u32 s2, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cselect_b32 s2, s5, s4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
@@ -789,9 +803,11 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s0, s0, 5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_u32 v0, v1
@@ -1036,15 +1052,18 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s1, s1, 5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB7_2:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -1220,13 +1239,16 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s1, s1, 5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, s1
; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: .LBB8_2:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -1645,4 +1667,3 @@ entry:
%bc = bitcast <2 x i32> %r.1 to <2 x float>
ret <2 x float> %bc
}
-
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
index 9445f1225e0cbe..99f4fbf3599483 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
@@ -64,6 +64,7 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc,
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
@@ -83,6 +84,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
@@ -100,6 +102,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
@@ -169,6 +172,7 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
@@ -188,6 +192,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
@@ -205,6 +210,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index aad74410d14538..c4a86952bc4146 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -587,7 +587,8 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
@@ -664,9 +665,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
@@ -680,9 +681,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
@@ -728,7 +729,8 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
@@ -805,9 +807,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
@@ -821,9 +823,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
@@ -3298,7 +3300,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
@@ -3344,7 +3347,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
@@ -3421,9 +3425,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
@@ -3437,9 +3441,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
@@ -3516,9 +3520,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
@@ -3532,9 +3536,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
index 320b0b4508b6a5..8a0602e0472b53 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -81,23 +81,27 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr6
; GFX12-NEXT: ; implicit-def: $vgpr5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 128
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 8bfe996c6a90a3..6e029f7c0a95e5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -210,6 +210,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal m0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var:
@@ -489,6 +490,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst m0
; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: global_load_b32 v1, v[2:3], off
@@ -516,8 +518,9 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
-; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_and_b32 s0, 1, s0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off
@@ -741,6 +744,7 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_barrier_init m0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0:
@@ -752,11 +756,12 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0
-; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s0
-; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_or_b32 m0, s1, s0
; GLOBAL-ISEL-NEXT: s_barrier_init m0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2)
ret void
@@ -945,6 +950,7 @@ define void @test5_s_barrier_join_m0(i32 %arg) {
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_barrier_join m0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0:
@@ -1202,6 +1208,7 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) {
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_wakeup_barrier m0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0:
@@ -1386,10 +1393,11 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
; GCN-NEXT: s_wait_bvhcnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_get_barrier_state s0, m0
; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -1403,7 +1411,8 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
%state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
index 78204dfefc80cc..2efade9fcbba17 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -50,23 +50,27 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB2_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
@@ -90,22 +94,26 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr0
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB3_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index 10059960030446..d5b5c71cc42a95 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -306,22 +306,26 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr0
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB4_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -444,22 +448,26 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr0
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB5_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 5f6a67e4660209..a312a3cb0a95cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -235,23 +235,27 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB4_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
@@ -344,23 +348,27 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB5_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
index bd803c380e90a5..8d1dce76d2cc8b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -554,18 +554,22 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -708,23 +712,27 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr7
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
index c9b50eddc94eef..06b1a9cc70513e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -554,18 +554,22 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -708,23 +712,27 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr7
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
index df5533b6295023..5ea89bc5749108 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
@@ -49,7 +49,7 @@ define amdgpu_gfx void @test_wave_id_callable(ptr addrspace(1) %out) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 5d3a5800bcdd8f..bc0daf95e329c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -515,11 +515,13 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_maximum_f16 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.maximum.f16(half %src0, half %src1)
%cast = bitcast half %op to i16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index e6655aeab7e9b2..6b61931fc9414b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -485,6 +485,7 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call float @llvm.maximum.f32(float %src0, float %src1)
call void asm sideeffect "; use $0", "s"(float %op)
@@ -888,6 +889,7 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:1]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
call void asm sideeffect "; use $0", "s"(<2 x float> %op)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 01effc24e741d1..77b5682a2dbd17 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -424,11 +424,13 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_minimum_f16 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.minimum.f16(half %src0, half %src1)
%cast = bitcast half %op to i16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 518fc27c23082b..8753dc50c4da40 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -485,6 +485,7 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call float @llvm.minimum.f32(float %src0, float %src1)
call void asm sideeffect "; use $0", "s"(float %op)
@@ -888,6 +889,7 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:1]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
call void asm sideeffect "; use $0", "s"(<2 x float> %op)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 7178eaf2e73846..0221f9992ad43e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -2360,6 +2360,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2
; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2
; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3
; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
@@ -2397,6 +2398,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015
; GFX12-NEXT: v_and_b32_e32 v22, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
; GFX12-NEXT: v_and_b32_e32 v4, 1, v5
@@ -2794,6 +2796,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000
; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013
; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3
; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3
; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3
@@ -2807,7 +2810,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016
; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014
; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2
; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9
@@ -3454,6 +3457,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
; GFX12-NEXT: v_lshrrev_b16 v9, 13, s3
; GFX12-NEXT: v_and_b32_e32 v44, 1, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v1, 1, s4
; GFX12-NEXT: s_lshr_b32 s5, s2, 24
; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_and_b32 v41, 1, v2
@@ -3467,14 +3471,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v13, 7, s3
; GFX12-NEXT: v_lshrrev_b16 v14, 1, s3
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v2, 5, s5
; GFX12-NEXT: s_and_b32 s7, s2, 1
; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v52, s18 :: v_dual_and_b32 v35, 1, v9
; GFX12-NEXT: v_and_b32_e32 v9, 1, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 3, s4
; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10017
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v51, s19 :: v_dual_and_b32 v42, 1, v3
; GFX12-NEXT: v_lshrrev_b16 v3, 3, s5
; GFX12-NEXT: v_lshrrev_b16 v15, 3, s3
@@ -3489,30 +3495,34 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v59, s12 :: v_dual_and_b32 v22, 1, v13
; GFX12-NEXT: v_dual_mov_b32 v62, s9 :: v_dual_and_b32 v13, 1, v17
; GFX12-NEXT: v_lshrrev_b16 v17, 6, s5
; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10016
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v58, s13 :: v_dual_and_b32 v23, 1, v14
; GFX12-NEXT: s_bfe_u32 s14, s2, 0x10015
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v57, s14 :: v_dual_and_b32 v26, 1, v11
; GFX12-NEXT: v_and_b32_e32 v11, 1, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 1, s5
; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10013
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v55, s15 :: v_dual_and_b32 v34, 1, v7
; GFX12-NEXT: v_lshrrev_b16 v7, 7, s5
; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10012
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v54, s16 :: v_dual_and_b32 v31, 1, v10
; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10011
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v53, s17 :: v_dual_and_b32 v38, 1, v5
; GFX12-NEXT: s_bfe_u32 s20, s3, 0x10016
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v50, s20 :: v_dual_and_b32 v39, 1, v6
; GFX12-NEXT: v_lshrrev_b16 v6, 2, s5
; GFX12-NEXT: s_bfe_u32 s21, s3, 0x10014
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v48, s21 :: v_dual_and_b32 v43, 1, v4
; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
@@ -3522,7 +3532,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v60, s11 :: v_dual_and_b32 v19, 1, v15
; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 14, s2
@@ -3541,6 +3551,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015
; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v63, s8 :: v_dual_and_b32 v2, 1, v6
; GFX12-NEXT: v_and_b32_e32 v6, 1, v17
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v23
@@ -4266,6 +4277,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2
; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4
; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4
; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4
@@ -4311,7 +4323,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s20, s3, 0x10016
; GFX12-NEXT: s_bfe_i32 s21, s3, 0x10014
; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v49, s3
; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1
@@ -6791,6 +6803,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3
@@ -6808,6 +6821,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v28, 1, v21
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
@@ -6817,6 +6831,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
@@ -6827,6 +6842,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v9, 1, v17
; GFX12-NEXT: v_and_b32_e32 v29, 1, v23
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
@@ -6842,6 +6858,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26
@@ -7554,6 +7571,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v62, v[30:33], s[0:1] offset:64
; GFX12-NEXT: global_store_b128 v62, v[26:29], s[0:1] offset:48
; GFX12-NEXT: global_store_b128 v62, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s3
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v62, v[4:7], s[0:1] offset:16
@@ -8449,6 +8467,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v43, 1, v10
; GFX12-NEXT: v_dual_mov_b32 v68, v1 :: v_dual_and_b32 v69, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v62, v1 :: v_dual_and_b32 v71, 0xffff, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_and_b32 v67, 0xffff, v3
; GFX12-NEXT: v_mov_b32_e32 v66, v1
; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, v1
@@ -8457,6 +8476,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3
; GFX12-NEXT: v_dual_mov_b32 v47, v1 :: v_dual_and_b32 v38, 1, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4
; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4
; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10016
@@ -8465,6 +8485,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v45, 1, v12
; GFX12-NEXT: v_and_b32_e32 v41, 1, v16
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: s_lshr_b32 s5, s2, 24
@@ -8473,6 +8494,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v40, 1, v8
; GFX12-NEXT: v_and_b32_e32 v44, 1, v14
; GFX12-NEXT: v_and_b32_e32 v14, 1, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5
; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5
; GFX12-NEXT: v_lshrrev_b16 v10, 3, s5
@@ -8483,6 +8505,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013
; GFX12-NEXT: v_and_b32_e32 v33, 1, v20
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_lshrrev_b16 v9, 15, s3
@@ -8509,6 +8532,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
@@ -8518,6 +8542,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v82, 0xffff, v35
; GFX12-NEXT: v_and_b32_e32 v35, 1, v27
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v27, v1
; GFX12-NEXT: v_and_b32_e32 v81, 0xffff, v4
@@ -8529,6 +8554,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v31
; GFX12-NEXT: v_and_b32_e32 v31, 1, v29
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
@@ -8538,6 +8564,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v21, 2, s2
; GFX12-NEXT: v_and_b32_e32 v33, 0xffff, v33
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: v_lshrrev_b16 v15, 8, s2
@@ -8561,6 +8588,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v39
; GFX12-NEXT: v_and_b32_e32 v39, 1, v25
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_and_b32 v77, 1, v7
; GFX12-NEXT: v_and_b32_e32 v79, 0xffff, v5
@@ -9818,6 +9846,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v75, s42 :: v_dual_mov_b32 v76, s43
; GFX12-NEXT: v_bfe_i32 v79, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v85, v65, 0, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v65, s40
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:144
@@ -9903,6 +9932,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v50, 31, v49
; GFX12-NEXT: v_ashrrev_i32_e32 v88, 31, v87
; GFX12-NEXT: v_ashrrev_i32_e32 v86, 31, v85
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v34, s19 :: v_dual_mov_b32 v17, s4
; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 355c296d122ff2..d5947aa790ceff 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -2966,6 +2966,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
; GFX12-NEXT: s_lshr_b32 s25, s6, 16
; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
; GFX12-NEXT: v_mov_b32_e32 v10, s11
; GFX12-NEXT: s_lshr_b32 s22, s5, 16
@@ -3456,6 +3457,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: s_ashr_i32 s25, s6, 16
; GFX12-NEXT: s_sext_i32_i16 s7, s7
; GFX12-NEXT: s_sext_i32_i16 s6, s6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
; GFX12-NEXT: v_mov_b32_e32 v10, s11
; GFX12-NEXT: s_ashr_i32 s22, s5, 16
@@ -5787,10 +5789,10 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -6022,6 +6024,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
@@ -6362,23 +6365,27 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s2, 0xffff, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_and_b32 s3, 0xffff, s6
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s5
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -6958,36 +6965,43 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s5
; GFX12-NEXT: s_lshr_b32 s4, s7, 16
; GFX12-NEXT: s_and_b32 s5, s7, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s4, s6, 16
; GFX12-NEXT: s_and_b32 s5, s6, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96
; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_lshr_b32 s2, s1, 16
; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
@@ -8039,76 +8053,91 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s15, s14, 16
; GFX12-NEXT: s_and_b32 s14, s14, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:240
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s14
; GFX12-NEXT: v_mov_b32_e32 v2, s15
; GFX12-NEXT: s_lshr_b32 s14, s13, 16
; GFX12-NEXT: s_and_b32 s13, s13, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:224
; GFX12-NEXT: v_mov_b32_e32 v0, s13
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s14
; GFX12-NEXT: s_lshr_b32 s13, s12, 16
; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:208
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s12
; GFX12-NEXT: v_mov_b32_e32 v2, s13
; GFX12-NEXT: s_lshr_b32 s12, s11, 16
; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:192
; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s12
; GFX12-NEXT: s_lshr_b32 s11, s10, 16
; GFX12-NEXT: s_and_b32 s10, s10, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:176
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s10
; GFX12-NEXT: v_mov_b32_e32 v2, s11
; GFX12-NEXT: s_lshr_b32 s10, s9, 16
; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:160
; GFX12-NEXT: v_mov_b32_e32 v0, s9
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s9, s8, 16
; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:144
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: s_lshr_b32 s8, s7, 16
; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:128
; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: s_lshr_b32 s7, s6, 16
; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: v_mov_b32_e32 v2, s7
; GFX12-NEXT: s_lshr_b32 s6, s5, 16
; GFX12-NEXT: s_and_b32 s5, s5, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96
; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s5
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64
; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_lshr_b32 s2, s1, 16
; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32
; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17]
@@ -8918,6 +8947,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61
; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59
; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57
; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55
@@ -8929,6 +8959,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192
; GFX12-NEXT: v_dual_mov_b32 v1, s53 :: v_dual_mov_b32 v0, s52
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
; GFX12-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v4, s44
; GFX12-NEXT: v_dual_mov_b32 v7, s51 :: v_dual_mov_b32 v6, s50
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index f1a6bccc559f04..4ab55164e09992 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -4390,6 +4390,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s22
; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s23
; GFX12-NEXT: v_mov_b32_e32 v5, s56
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 67a376b8c0f3c5..7f26ad7009e44a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -1119,8 +1119,8 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
; GFX12-NEXT: s_and_b32 s3, s2, 0xff
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -1223,8 +1223,8 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
; GFX12-NEXT: s_sext_i32_i8 s3, s2
; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
@@ -1332,6 +1332,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
@@ -1439,6 +1440,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: s_ashr_i32 s3, s2, 24
; GFX12-NEXT: s_sext_i32_i8 s4, s2
; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s2
@@ -1597,6 +1599,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5
; GFX12-NEXT: v_mov_b32_e32 v6, s3
@@ -1761,6 +1764,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: s_sext_i32_i8 s3, s3
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6
; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_mov_b32_e32 v4, s3
@@ -2018,11 +2022,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s3, s5, 24
; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: s_lshr_b32 s2, s4, 24
; GFX12-NEXT: s_and_b32 s10, s4, 0xff
; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2
; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5
; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9
@@ -2294,6 +2300,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: s_ashr_i32 s2, s4, 24
@@ -2305,6 +2312,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v8, s5
; GFX12-NEXT: v_mov_b32_e32 v10, s9
; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v12, s4
; GFX12-NEXT: v_mov_b32_e32 v14, s3
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
@@ -2753,7 +2761,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_and_b32 s21, s7, 0xff
; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13
; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
@@ -2767,6 +2775,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14
; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20
; GFX12-NEXT: s_lshr_b32 s3, s5, 24
; GFX12-NEXT: s_and_b32 s19, s5, 0xff
@@ -2776,6 +2785,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_and_b32 s18, s4, 0xff
; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
@@ -3263,6 +3273,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_sext_i32_i8 s7, s7
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22
; GFX12-NEXT: v_mov_b32_e32 v2, s11
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8
@@ -3276,6 +3287,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18
; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2
; GFX12-NEXT: v_mov_b32_e32 v30, s19
; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010
@@ -3288,6 +3300,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v20, s6
; GFX12-NEXT: v_mov_b32_e32 v22, s15
; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v16, s5
; GFX12-NEXT: v_mov_b32_e32 v18, s13
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
@@ -4116,11 +4129,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
; GFX12-NEXT: v_dual_mov_b32 v60, 0 :: v_dual_and_b32 v5, 0xffff, v5
; GFX12-NEXT: v_dual_mov_b32 v56, s50 :: v_dual_and_b32 v9, 0xffff, v9
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v58, s15
; GFX12-NEXT: s_and_b32 s43, s8, 0xff
; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
; GFX12-NEXT: s_and_b32 s48, s13, 0xff
; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v30, s43 :: v_dual_and_b32 v57, 0xffff, v0
; GFX12-NEXT: v_dual_mov_b32 v59, s34 :: v_dual_mov_b32 v32, s8
; GFX12-NEXT: s_lshr_b32 s27, s9, 24
@@ -4132,6 +4147,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
; GFX12-NEXT: s_and_b32 s47, s12, 0xff
; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v36, s9 :: v_dual_and_b32 v53, 0xffff, v2
; GFX12-NEXT: v_dual_mov_b32 v55, s33 :: v_dual_mov_b32 v26, s42
; GFX12-NEXT: s_lshr_b32 s25, s7, 24
@@ -4139,6 +4155,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_and_b32 v23, 0xffff, v12
; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v34, s44 :: v_dual_and_b32 v49, 0xffff, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v51, s31 :: v_dual_mov_b32 v28, s7
; GFX12-NEXT: s_lshr_b32 s28, s10, 24
; GFX12-NEXT: s_lshr_b32 s29, s11, 24
@@ -4148,6 +4165,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_and_b32 s45, s10, 0xff
; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
; GFX12-NEXT: s_and_b32 s46, s11, 0xff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v40, s10 :: v_dual_and_b32 v45, 0xffff, v4
; GFX12-NEXT: v_dual_mov_b32 v47, s30 :: v_dual_mov_b32 v22, s41
; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
@@ -4162,10 +4180,11 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v60, v[52:55], s[16:17] offset:224
; GFX12-NEXT: global_store_b128 v60, v[48:51], s[16:17] offset:208
; GFX12-NEXT: global_store_b128 v60, v[44:47], s[16:17] offset:192
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v44, s11 :: v_dual_mov_b32 v45, s29
; GFX12-NEXT: v_mov_b32_e32 v24, s6
; GFX12-NEXT: s_and_b32 s40, s5, 0xff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v41, s28 :: v_dual_mov_b32 v20, s40
; GFX12-NEXT: s_lshr_b32 s23, s5, 24
; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
@@ -4175,6 +4194,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_and_b32 s39, s4, 0xff
; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v33, s26 :: v_dual_mov_b32 v16, s39
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v29, s25 :: v_dual_mov_b32 v18, s4
; GFX12-NEXT: s_lshr_b32 s21, s3, 24
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
@@ -4187,10 +4207,12 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v60, v[26:29], s[16:17] offset:112
; GFX12-NEXT: global_store_b128 v60, v[22:25], s[16:17] offset:96
; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s23
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v14, s3
; GFX12-NEXT: s_lshr_b32 s20, s2, 24
; GFX12-NEXT: s_and_b32 s37, s2, 0xff
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v8, s37
; GFX12-NEXT: s_lshr_b32 s19, s1, 24
; GFX12-NEXT: s_and_b32 s36, s1, 0xff
@@ -4199,6 +4221,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s18, s0, 24
; GFX12-NEXT: s_and_b32 s35, s0, 0xff
; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v4, s36
; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s19
; GFX12-NEXT: v_dual_mov_b32 v0, s35 :: v_dual_mov_b32 v3, s18
@@ -5061,6 +5084,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_ashr_i32 s47, s14, 24
; GFX12-NEXT: s_bfe_i32 s48, s14, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s14, s14
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v59, 0 :: v_dual_mov_b32 v52, s15
; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11
; GFX12-NEXT: s_ashr_i32 s45, s13, 24
@@ -5080,6 +5104,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s42, s11, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s11, s11
; GFX12-NEXT: v_bfe_i32 v45, v3, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s43
; GFX12-NEXT: v_mov_b32_e32 v46, s46
; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
@@ -5104,12 +5129,14 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v59, v[40:43], s[16:17] offset:192
; GFX12-NEXT: v_mov_b32_e32 v41, s39
; GFX12-NEXT: v_dual_mov_b32 v55, s11 :: v_dual_mov_b32 v58, s41
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v37, s37
; GFX12-NEXT: s_ashr_i32 s33, s7, 24
; GFX12-NEXT: s_ashr_i32 s35, s8, 24
; GFX12-NEXT: s_bfe_i32 s36, s8, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s8, s8
; GFX12-NEXT: v_bfe_i32 v39, v7, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v33, s35
; GFX12-NEXT: v_dual_mov_b32 v40, s40 :: v_dual_mov_b32 v29, s33
; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3
@@ -5131,6 +5158,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s29, s5, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s5, s5
; GFX12-NEXT: v_bfe_i32 v31, v10, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v19, s26
; GFX12-NEXT: v_mov_b32_e32 v32, s36
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
@@ -5158,6 +5186,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s28
; GFX12-NEXT: s_bfe_i32 s23, s2, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s2, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v16, s4
; GFX12-NEXT: v_mov_b32_e32 v18, s27
; GFX12-NEXT: s_bfe_i32 s21, s1, 0x80010
@@ -5171,6 +5200,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v8, s2
; GFX12-NEXT: v_mov_b32_e32 v10, s23
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s21
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
@@ -5869,7 +5899,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_lshr_b32 s4, s2, 24
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
@@ -5877,6 +5907,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_and_b32 s2, s2, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -6027,6 +6058,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s6
@@ -6225,26 +6257,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX12-NEXT: s_lshr_b32 s5, s3, 24
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_lshr_b32 s4, s2, 24
; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s2, s2, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX12-NEXT: s_and_b32 s2, s3, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
@@ -6490,7 +6525,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8
; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9
@@ -6831,47 +6866,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_lshr_b32 s3, s7, 24
; GFX12-NEXT: s_lshr_b32 s2, s5, 24
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s2, s6, 24
; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s2, s4, 24
; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_and_b32 s2, s6, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5
; GFX12-NEXT: s_and_b32 s2, s7, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
; GFX12-NEXT: s_and_b32 s2, s5, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX12-NEXT: s_and_b32 s2, s4, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -7303,6 +7346,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5
@@ -7939,48 +7983,56 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
; GFX12-NEXT: s_lshr_b32 s11, s7, 24
; GFX12-NEXT: s_lshr_b32 s10, s5, 24
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s11, s5, 0x80010
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
; GFX12-NEXT: s_and_b32 s7, s7, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:240
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s3, 24
; GFX12-NEXT: s_bfe_u32 s11, s3, 0x80010
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:176
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s1, 24
; GFX12-NEXT: s_bfe_u32 s11, s1, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s6, 24
; GFX12-NEXT: s_bfe_u32 s11, s6, 0x80010
; GFX12-NEXT: s_and_b32 s6, s6, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s4, 24
; GFX12-NEXT: s_bfe_u32 s11, s4, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:208
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s2, 24
; GFX12-NEXT: s_bfe_u32 s11, s2, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:144
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s0, 24
; GFX12-NEXT: s_bfe_u32 s11, s0, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
@@ -7996,6 +8048,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
; GFX12-NEXT: s_and_b32 s4, s4, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:192
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
@@ -8008,6 +8061,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
; GFX12-NEXT: s_and_b32 s2, s2, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:128
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
@@ -8020,6 +8074,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0
; GFX12-NEXT: s_and_b32 s0, s0, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
@@ -8866,6 +8921,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v60, s42 :: v_dual_mov_b32 v29, s47
; GFX12-NEXT: v_dual_mov_b32 v28, s46 :: v_dual_mov_b32 v63, s45
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v21, s5
; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v17, s15
; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v9, s13
@@ -9605,14 +9661,16 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2
; GFX12-NEXT: s_lshr_b32 s2, s2, 24
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1
; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -9758,11 +9816,13 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4
; GFX12-NEXT: s_ashr_i32 s2, s2, 24
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -9961,6 +10021,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX12-NEXT: s_lshr_b32 s2, s3, 24
; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3
; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
@@ -10187,8 +10248,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX12-NEXT: s_ashr_i32 s2, s2, 24
; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000
; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5
@@ -10537,6 +10600,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5
; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11
; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12
; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9
@@ -10926,6 +10990,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5
; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000
; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6
; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000
@@ -10937,14 +11002,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5
; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12
; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6
; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11
; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10
; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000
; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3
; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2
; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
@@ -11566,6 +11633,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v6
; GFX12-NEXT: v_lshl_or_b32 v6, v4, 16, v10
; GFX12-NEXT: v_lshl_or_b32 v4, v3, 16, v11
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v3, s24, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v10, v9, 16, v12
; GFX12-NEXT: v_lshl_or_b32 v8, v8, 16, v13
@@ -11580,6 +11648,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s16, s5, 24
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v11, s16, 16, v17
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
@@ -11593,11 +11662,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s18, s2, 24
; GFX12-NEXT: v_lshl_or_b32 v14, v5, 16, v9
; GFX12-NEXT: v_lshl_or_b32 v12, v1, 16, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v15, s12, 16, v13
; GFX12-NEXT: v_lshl_or_b32 v13, s10, 16, v17
; GFX12-NEXT: s_lshr_b32 s22, s0, 24
; GFX12-NEXT: v_lshl_or_b32 v9, s14, 16, v19
; GFX12-NEXT: v_lshl_or_b32 v5, s18, 16, v18
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v1, s22, 16, v20
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48
@@ -12316,6 +12387,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0
; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80000
; GFX12-NEXT: v_ashrrev_i16 v5, 8, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_ashr_i64 s[0:1], s[4:5], 56
; GFX12-NEXT: v_and_b32_e64 v10, 0xffff, s2
; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s20
@@ -12323,6 +12395,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i16 v3, 8, s3
; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX12-NEXT: s_bfe_i32 s2, s15, 0x80000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s12, 0x80000
; GFX12-NEXT: v_and_b32_e64 v2, 0xffff, s18
@@ -12333,9 +12406,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s2
; GFX12-NEXT: v_lshl_or_b32 v4, v4, 16, v10
; GFX12-NEXT: v_lshl_or_b32 v10, v5, 16, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s7, 0x80000
; GFX12-NEXT: s_lshr_b32 s11, s7, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000
; GFX12-NEXT: s_lshr_b32 s10, s6, 16
@@ -12344,9 +12419,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v8
; GFX12-NEXT: v_lshl_or_b32 v8, v7, 16, v13
; GFX12-NEXT: v_lshl_or_b32 v7, v11, 16, v15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s11, 0x80000
; GFX12-NEXT: s_lshr_b32 s13, s5, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v22, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s10, 0x80000
; GFX12-NEXT: v_ashrrev_i16 v9, 8, s17
@@ -12355,6 +12432,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6
; GFX12-NEXT: v_ashrrev_i16 v21, 8, s11
; GFX12-NEXT: v_ashrrev_i16 v23, 8, s10
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v24, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s5, s16, 0x80000
; GFX12-NEXT: v_ashrrev_i16 v1, 8, s12
@@ -12362,11 +12440,13 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s4, s13, 0x80000
; GFX12-NEXT: v_and_b32_e64 v20, 0xffff, s3
; GFX12-NEXT: v_ashrrev_i16 v17, 8, s16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v19, 0xffff, s5
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s4, s0
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: v_lshl_or_b32 v3, v9, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v14, v11, 16, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v11, s0
; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v15
; GFX12-NEXT: v_lshl_or_b32 v15, v21, 16, v22
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 65614a17fc0114..aeb88bcbe0028e 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -503,12 +503,14 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f64:
@@ -694,12 +696,14 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f64__offset:
@@ -884,12 +888,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f64:
@@ -1066,12 +1072,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f64__offset:
@@ -1267,13 +1275,15 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f16:
@@ -1573,13 +1583,15 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f16__offset:
@@ -1887,12 +1899,14 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f16:
@@ -2181,12 +2195,14 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f16__offset:
@@ -2474,13 +2490,15 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f16__offset__align4:
@@ -2710,12 +2728,14 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f16__offset__align4:
@@ -2958,13 +2978,15 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_bf16:
@@ -3312,13 +3334,15 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset:
@@ -3674,12 +3698,14 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_bf16:
@@ -4016,12 +4042,14 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset:
@@ -4357,13 +4385,15 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
@@ -4648,12 +4678,14 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
@@ -7018,11 +7050,12 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX12-NEXT: s_cbranch_execz .LBB28_2
; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5
; GFX12-NEXT: s_lshl_b32 s5, s1, 3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1
@@ -7030,25 +7063,27 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB28_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_mov_b32 s7, exec_lo
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
; GFX12-NEXT: s_mov_b32 s6, exec_lo
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX12-NEXT: s_cbranch_execz .LBB28_4
; GFX12-NEXT: ; %bb.3:
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GFX12-NEXT: s_lshl_b32 s0, s1, 4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_f32 v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB28_4:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
@@ -7061,22 +7096,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: ; implicit-def: $vgpr0
; GFX12-NEXT: .LBB28_5: ; %ComputeLoop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_ctz_i32_b32 s5, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s5
; GFX12-NEXT: s_lshl_b32 s7, 1, s5
; GFX12-NEXT: v_writelane_b32 v0, s0, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12-NEXT: s_cbranch_execz .LBB28_8
; GFX12-NEXT: ; %bb.7:
@@ -7086,6 +7125,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB28_8:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v1
@@ -7885,32 +7925,36 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX12-NEXT: s_cbranch_execz .LBB29_2
; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5
; GFX12-NEXT: s_lshl_b32 s5, s1, 3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX12-NEXT: .LBB29_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, exec_lo
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
; GFX12-NEXT: s_mov_b32 s6, exec_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX12-NEXT: s_cbranch_execz .LBB29_4
; GFX12-NEXT: ; %bb.3:
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GFX12-NEXT: s_lshl_b32 s0, s1, 4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: ds_add_f32 v2, v1
; GFX12-NEXT: .LBB29_4:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
@@ -7923,28 +7967,33 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: ; implicit-def: $vgpr0
; GFX12-NEXT: .LBB29_5: ; %ComputeLoop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_ctz_i32_b32 s5, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s5
; GFX12-NEXT: s_lshl_b32 s7, 1, s5
; GFX12-NEXT: v_writelane_b32 v0, s0, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12-NEXT: s_cbranch_execz .LBB29_8
; GFX12-NEXT: ; %bb.7:
; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2
; GFX12-NEXT: .LBB29_8:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 6dec36c316ee31..cc79db1b20af46 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -816,13 +816,15 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16:
@@ -1129,13 +1131,15 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16__offset:
@@ -1450,12 +1454,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16:
@@ -1752,12 +1758,14 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16__offset:
@@ -2053,13 +2061,15 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16__offset__align4:
@@ -2297,12 +2307,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16__offset__align4:
@@ -2552,13 +2564,15 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16:
@@ -2908,13 +2922,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset:
@@ -3272,12 +3288,14 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16:
@@ -3616,12 +3634,14 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset:
@@ -3959,13 +3979,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
@@ -4252,12 +4274,14 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
@@ -4531,13 +4555,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2f16:
@@ -4802,13 +4828,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2f16__offset:
@@ -5073,12 +5101,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2f16:
@@ -5334,12 +5364,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2f16__offset:
@@ -5618,13 +5650,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2bf16:
@@ -5994,13 +6028,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2bf16__offset:
@@ -6370,12 +6406,14 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2bf16:
@@ -6733,12 +6771,14 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index b3132a2fa80dd2..1ffd93e35d8cd9 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -816,13 +816,15 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16:
@@ -1129,13 +1131,15 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16__offset:
@@ -1450,12 +1454,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16:
@@ -1752,12 +1758,14 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16__offset:
@@ -2053,13 +2061,15 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16__offset__align4:
@@ -2297,12 +2307,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16__offset__align4:
@@ -2552,13 +2564,15 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16:
@@ -2908,13 +2922,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset:
@@ -3272,12 +3288,14 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16:
@@ -3616,12 +3634,14 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset:
@@ -3959,13 +3979,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
@@ -4252,12 +4274,14 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
@@ -4531,13 +4555,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2f16:
@@ -4802,13 +4828,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2f16__offset:
@@ -5073,12 +5101,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2f16:
@@ -5334,12 +5364,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2f16__offset:
@@ -5618,13 +5650,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2bf16:
@@ -5994,13 +6028,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2bf16__offset:
@@ -6370,12 +6406,14 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2bf16:
@@ -6733,12 +6771,14 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 5ebeddd04b2ae8..9bc8bafc34a68f 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -35,13 +35,15 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f32:
@@ -246,13 +248,15 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f32__offset:
@@ -457,12 +461,14 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f32:
@@ -657,12 +663,14 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f32__offset:
@@ -865,12 +873,14 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f64:
@@ -1081,12 +1091,14 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f64__offset:
@@ -1296,12 +1308,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f64:
@@ -1501,12 +1515,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f64__offset:
@@ -1725,13 +1741,15 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f16:
@@ -2031,13 +2049,15 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f16__offset:
@@ -2345,12 +2365,14 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f16:
@@ -2639,12 +2661,14 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f16__offset:
@@ -2932,13 +2956,15 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f16__offset__align4:
@@ -3168,12 +3194,14 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f16__offset__align4:
@@ -3416,13 +3444,15 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_bf16:
@@ -3770,13 +3800,15 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset:
@@ -4132,12 +4164,14 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_bf16:
@@ -4474,12 +4508,14 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset:
@@ -4815,13 +4851,15 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
@@ -5106,12 +5144,14 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
@@ -5381,13 +5421,15 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_v2f16:
@@ -5635,13 +5677,15 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_v2f16__offset:
@@ -5888,12 +5932,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_v2f16:
@@ -6130,12 +6176,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_v2f16__offset:
@@ -6398,13 +6446,15 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_v2bf16:
@@ -6774,13 +6824,15 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_v2bf16__offset:
@@ -7150,12 +7202,14 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_v2bf16:
@@ -7513,12 +7567,14 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
@@ -7864,13 +7920,15 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode:
@@ -8074,12 +8132,14 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode:
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 390d1d70ff2aae..df954f6f940c8c 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
@@ -149,6 +149,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: .LBB3_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_add_co_i32 s2, s2, -1
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index fef1b57db5685d..acba2841a71073 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -200,6 +200,7 @@ define amdgpu_kernel void @caller() {
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -212,6 +213,7 @@ define amdgpu_kernel void @caller() {
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -276,9 +278,10 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index f90753652baa5e..1da05ed264a647 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -107,6 +107,7 @@ define amdgpu_cs void @caller() {
; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi
; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -116,6 +117,7 @@ define amdgpu_cs void @caller() {
; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -182,9 +184,10 @@ define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index 94d1eca05ed0ec..8a789a4c6cda9b 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -106,17 +106,20 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v1, s59, 0
; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
; GFX12-NEXT: s_bitset0_b32 s0, 0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s59, scc
@@ -124,8 +127,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
; GFX12-NEXT: v_readlane_b32 s59, v1, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
@@ -313,10 +318,12 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v1, s59, 0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
@@ -327,8 +334,10 @@ define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
; GFX12-NEXT: v_readlane_b32 s59, v1, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc:
@@ -530,17 +539,20 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v1, s59, 0
; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000
; GFX12-NEXT: v_mov_b32_e32 v0, s33
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bitcmp1_b32 s0, 0
; GFX12-NEXT: s_bitset0_b32 s0, 0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s59, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s59, scc
@@ -548,10 +560,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
; GFX12-NEXT: v_readlane_b32 s59, v1, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
; GFX12-NEXT: s_mov_b32 s33, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
@@ -745,6 +759,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v0, s59, 0
; GFX12-NEXT: s_mov_b32 s59, s32
@@ -756,8 +771,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset()
; GFX12-NEXT: v_readlane_b32 s59, v0, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
@@ -911,6 +928,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v0, s59, 0
; GFX12-NEXT: s_mov_b32 s59, s32
@@ -921,8 +939,10 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0
; GFX12-NEXT: v_readlane_b32 s59, v0, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
@@ -1092,6 +1112,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v0, s59, 0
; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
@@ -1103,10 +1124,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp
; GFX12-NEXT: v_readlane_b32 s59, v0, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
; GFX12-NEXT: s_mov_b32 s33, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
@@ -1292,6 +1315,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX12-NEXT: s_mov_b32 s33, s32
; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: v_writelane_b32 v0, s59, 0
; GFX12-NEXT: s_mov_b32 s59, s33
@@ -1303,10 +1327,12 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp()
; GFX12-NEXT: v_readlane_b32 s59, v0, 0
; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
; GFX12-NEXT: s_mov_b32 s33, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
@@ -1492,9 +1518,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
; GFX12-NEXT: v_writelane_b32 v2, s59, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
@@ -1509,8 +1537,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX12-NEXT: v_readlane_b32 s59, v2, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
@@ -1710,10 +1740,12 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000
; GFX12-NEXT: v_writelane_b32 v2, s59, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_add_nc_u32_e64 v1, s0, s1
; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
@@ -1728,8 +1760,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse
; GFX12-NEXT: v_readlane_b32 s59, v2, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 6346406fa89410..9829b7e787d479 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -673,6 +673,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v23, s30, 0
; GFX12-NEXT: v_mov_b32_e32 v0, s32
@@ -711,13 +712,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0x4000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bitcmp1_b32 s32, 0
; GFX12-NEXT: v_writelane_b32 v23, s59, 28
; GFX12-NEXT: s_bitset0_b32 s32, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s59, s32
; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0xffffc000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bitcmp1_b32 s32, 0
; GFX12-NEXT: s_bitset0_b32 s32, 0
; GFX12-NEXT: ;;#ASMSTART
@@ -754,8 +756,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0
; GFX12-NEXT: v_readlane_b32 s30, v23, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 64, addrspace(5)
%alloca1 = alloca i32, align 4, addrspace(5)
@@ -1396,6 +1400,7 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v21, s30, 0
; GFX12-NEXT: v_writelane_b32 v21, s31, 1
@@ -1466,8 +1471,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX12-NEXT: v_readlane_b32 s30, v21, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 16, addrspace(5)
@@ -2196,16 +2203,18 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v23, s30, 0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v23, s31, 1
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use alloca0 v1
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
; GFX12-NEXT: v_writelane_b32 v23, s33, 2
; GFX12-NEXT: v_writelane_b32 v23, s34, 3
@@ -2271,8 +2280,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: v_readlane_b32 s30, v23, 0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%alloca0 = alloca [4096 x i32], align 64, addrspace(5)
%alloca1 = alloca [4096 x i32], align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 45e8b3bcff13c5..e9b4ec52599a01 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX12-WGP-LABEL: flat_agent_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX12-CU-LABEL: flat_agent_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX12-WGP-LABEL: flat_agent_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX12-CU-LABEL: flat_agent_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -547,6 +551,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX12-WGP-LABEL: flat_agent_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -565,6 +570,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX12-CU-LABEL: flat_agent_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -765,6 +771,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-WGP-LABEL: flat_agent_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -787,6 +794,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-LABEL: flat_agent_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1550,6 +1558,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1562,6 +1571,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1729,6 +1739,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1743,6 +1754,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_agent_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1901,6 +1913,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1918,6 +1931,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-LABEL: flat_agent_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2106,6 +2120,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2125,6 +2140,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2315,6 +2331,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2334,6 +2351,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -3215,6 +3233,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3231,6 +3250,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3473,6 +3493,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3491,6 +3512,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3724,6 +3746,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3745,6 +3768,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4008,6 +4032,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4031,6 +4056,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4296,6 +4322,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4319,6 +4346,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4568,6 +4596,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4586,6 +4615,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4830,6 +4860,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4848,6 +4879,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5108,6 +5140,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5131,6 +5164,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5396,6 +5430,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5419,6 +5454,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5684,6 +5720,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5707,6 +5744,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5972,6 +6010,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5995,6 +6034,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6260,6 +6300,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6283,6 +6324,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6548,6 +6590,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6571,6 +6614,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6836,6 +6880,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6859,6 +6904,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7124,6 +7170,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7147,6 +7194,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7401,6 +7449,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7421,6 +7470,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7687,6 +7737,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7710,6 +7761,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7982,6 +8034,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8007,6 +8060,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8294,6 +8348,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8322,6 +8377,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8612,6 +8668,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8640,6 +8697,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8914,6 +8972,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8937,6 +8996,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9206,6 +9266,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9229,6 +9290,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9514,6 +9576,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9542,6 +9605,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9832,6 +9896,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9860,6 +9925,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10150,6 +10216,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10178,6 +10245,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10468,6 +10536,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10496,6 +10565,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10786,6 +10856,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10814,6 +10885,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11104,6 +11176,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11132,6 +11205,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11422,6 +11496,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11450,6 +11525,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11740,6 +11816,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11768,6 +11845,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11953,6 +12031,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_agent_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11968,6 +12047,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX12-CU-LABEL: flat_agent_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12138,6 +12218,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12153,6 +12234,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12344,6 +12426,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12363,6 +12446,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12572,6 +12656,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12595,6 +12680,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13359,6 +13445,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13371,6 +13458,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13534,6 +13622,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13548,6 +13637,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13706,6 +13796,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13723,6 +13814,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13907,6 +13999,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13926,6 +14019,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -14112,6 +14206,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -14131,6 +14226,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -15042,6 +15138,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15058,6 +15155,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15296,6 +15394,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15314,6 +15413,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15547,6 +15647,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15568,6 +15669,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15827,6 +15929,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15850,6 +15953,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16111,6 +16215,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16134,6 +16239,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16379,6 +16485,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16397,6 +16504,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16637,6 +16745,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16655,6 +16764,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16911,6 +17021,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16934,6 +17045,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17195,6 +17307,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17218,6 +17331,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17479,6 +17593,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17502,6 +17617,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17763,6 +17879,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17786,6 +17903,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18047,6 +18165,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18070,6 +18189,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18331,6 +18451,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18354,6 +18475,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18615,6 +18737,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18638,6 +18761,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18899,6 +19023,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18922,6 +19047,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19176,6 +19302,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19196,6 +19323,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19470,6 +19598,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19494,6 +19623,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19767,6 +19897,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19792,6 +19923,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20087,6 +20219,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20116,6 +20249,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20415,6 +20549,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20444,6 +20579,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20727,6 +20863,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20751,6 +20888,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21029,6 +21167,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21053,6 +21192,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21347,6 +21487,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21376,6 +21517,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21675,6 +21817,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21704,6 +21847,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22003,6 +22147,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22032,6 +22177,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22331,6 +22477,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22360,6 +22507,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22659,6 +22807,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22688,6 +22837,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22987,6 +23137,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23016,6 +23167,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23315,6 +23467,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23344,6 +23497,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23643,6 +23797,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23672,6 +23827,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index fb40274cac1bac..5c59481c598539 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -29,18 +30,23 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-NEXT: s_mov_b32 s2, 2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-NEXT: s_mov_b32 s2, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-NEXT: v_mov_b32_e32 v2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s3, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_mov_b32 s2, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -64,6 +70,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -88,6 +95,7 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out)
; GFX12-LABEL: flat_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 5fa8e6891bafb1..b2340caa2933f9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-WGP-LABEL: flat_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-LABEL: flat_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -475,18 +477,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, s4
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1
; GFX12-WGP-NEXT: s_mov_b32 s2, s5
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -504,18 +511,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_mov_b32 s2, 0
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr2
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, s4
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1
; GFX12-CU-NEXT: s_mov_b32 s2, s5
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -688,6 +700,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-WGP-LABEL: flat_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -703,6 +716,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-LABEL: flat_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1007,17 +1021,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-WGP-NEXT: s_mov_b32 s0, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-WGP-NEXT: s_mov_b32 s0, 0
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
; GFX12-WGP-NEXT: s_mov_b32 s1, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3
; GFX12-WGP-NEXT: s_mov_b32 s0, s3
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
@@ -1036,17 +1055,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-CU-NEXT: s_mov_b32 s0, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-CU-NEXT: s_mov_b32 s0, 0
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
; GFX12-CU-NEXT: s_mov_b32 s1, s2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3
; GFX12-CU-NEXT: s_mov_b32 s0, s3
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
@@ -1224,6 +1248,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1242,6 +1267,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 4c9ce15211e34c..304c80d7bb24d4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX12-WGP-LABEL: flat_singlethread_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX12-CU-LABEL: flat_singlethread_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX12-CU-LABEL: flat_singlethread_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -534,6 +538,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX12-WGP-LABEL: flat_singlethread_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -549,6 +554,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX12-CU-LABEL: flat_singlethread_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -719,6 +725,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -734,6 +741,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1438,6 +1446,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1450,6 +1459,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1590,6 +1600,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1602,6 +1613,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1742,6 +1754,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1754,6 +1767,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1894,6 +1908,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1906,6 +1921,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2046,6 +2062,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2058,6 +2075,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2823,6 +2841,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -2839,6 +2858,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3054,6 +3074,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3070,6 +3091,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3285,6 +3307,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3301,6 +3324,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3516,6 +3540,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3532,6 +3557,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3747,6 +3773,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3763,6 +3790,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3978,6 +4006,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3994,6 +4023,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4209,6 +4239,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4225,6 +4256,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4440,6 +4472,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4456,6 +4489,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4671,6 +4705,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4687,6 +4722,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4902,6 +4938,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4918,6 +4955,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5133,6 +5171,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5149,6 +5188,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5364,6 +5404,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5380,6 +5421,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5595,6 +5637,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5611,6 +5654,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5826,6 +5870,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5842,6 +5887,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6057,6 +6103,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6073,6 +6120,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6320,6 +6368,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6340,6 +6389,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6593,6 +6643,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6613,6 +6664,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6866,6 +6918,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6886,6 +6939,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7139,6 +7193,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7159,6 +7214,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7412,6 +7468,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7432,6 +7489,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7685,6 +7743,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7705,6 +7764,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7958,6 +8018,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7978,6 +8039,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8231,6 +8293,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8251,6 +8314,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8504,6 +8568,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8524,6 +8589,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8777,6 +8843,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8797,6 +8864,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9050,6 +9118,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9070,6 +9139,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9323,6 +9393,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9343,6 +9414,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9596,6 +9668,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9616,6 +9689,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9869,6 +9943,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9889,6 +9964,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10142,6 +10218,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10162,6 +10239,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10339,6 +10417,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10354,6 +10433,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10524,6 +10604,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10539,6 +10620,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10709,6 +10791,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10724,6 +10807,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10894,6 +10978,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10909,6 +10994,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11613,6 +11699,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11625,6 +11712,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11765,6 +11853,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11777,6 +11866,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11917,6 +12007,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11929,6 +12020,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12069,6 +12161,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12081,6 +12174,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12221,6 +12315,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12233,6 +12328,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12998,6 +13094,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13014,6 +13111,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13229,6 +13327,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13245,6 +13344,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13460,6 +13560,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13476,6 +13577,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13691,6 +13793,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13707,6 +13810,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13922,6 +14026,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13938,6 +14043,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14153,6 +14259,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14169,6 +14276,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14384,6 +14492,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14400,6 +14509,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14615,6 +14725,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14631,6 +14742,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14846,6 +14958,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14862,6 +14975,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15077,6 +15191,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15093,6 +15208,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15308,6 +15424,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15324,6 +15441,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15539,6 +15657,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15555,6 +15674,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15770,6 +15890,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15786,6 +15907,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16001,6 +16123,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16017,6 +16140,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16232,6 +16356,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16248,6 +16373,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16495,6 +16621,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16515,6 +16642,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16768,6 +16896,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16788,6 +16917,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17041,6 +17171,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17061,6 +17192,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17314,6 +17446,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17334,6 +17467,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17587,6 +17721,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17607,6 +17742,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17860,6 +17996,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17880,6 +18017,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18133,6 +18271,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18153,6 +18292,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18406,6 +18546,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18426,6 +18567,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18679,6 +18821,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18699,6 +18842,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18952,6 +19096,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18972,6 +19117,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19225,6 +19371,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19245,6 +19392,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19498,6 +19646,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19518,6 +19667,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19771,6 +19921,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19791,6 +19942,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20044,6 +20196,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20064,6 +20217,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20317,6 +20471,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20337,6 +20492,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index e77f1432c1c9d0..3502a29edeecb3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX12-WGP-LABEL: flat_system_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX12-CU-LABEL: flat_system_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX12-WGP-LABEL: flat_system_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX12-CU-LABEL: flat_system_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -549,6 +553,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX12-WGP-LABEL: flat_system_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -567,6 +572,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX12-CU-LABEL: flat_system_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -769,6 +775,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-WGP-LABEL: flat_system_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -791,6 +798,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-LABEL: flat_system_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1558,6 +1566,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_system_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1570,6 +1579,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1739,6 +1749,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_system_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1753,6 +1764,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_system_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1913,6 +1925,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-WGP-LABEL: flat_system_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1930,6 +1943,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-LABEL: flat_system_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2122,6 +2136,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_system_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2141,6 +2156,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_system_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2335,6 +2351,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_system_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2354,6 +2371,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_system_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -3245,6 +3263,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3261,6 +3280,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3505,6 +3525,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3523,6 +3544,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3758,6 +3780,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3779,6 +3802,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4046,6 +4070,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4069,6 +4094,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4338,6 +4364,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4361,6 +4388,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4612,6 +4640,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4630,6 +4659,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4876,6 +4906,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4894,6 +4925,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5158,6 +5190,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5181,6 +5214,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5450,6 +5484,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5473,6 +5508,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5742,6 +5778,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5765,6 +5802,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6034,6 +6072,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6057,6 +6096,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6326,6 +6366,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6349,6 +6390,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6618,6 +6660,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6641,6 +6684,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6910,6 +6954,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6933,6 +6978,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7202,6 +7248,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7225,6 +7272,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7479,6 +7527,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7499,6 +7548,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7767,6 +7817,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7790,6 +7841,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8064,6 +8116,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8089,6 +8142,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8380,6 +8434,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8408,6 +8463,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8702,6 +8758,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8730,6 +8787,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9006,6 +9064,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9029,6 +9088,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9300,6 +9360,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9323,6 +9384,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9612,6 +9674,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9640,6 +9703,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9934,6 +9998,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9962,6 +10027,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10256,6 +10322,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10284,6 +10351,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10578,6 +10646,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10606,6 +10675,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10900,6 +10970,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10928,6 +10999,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11222,6 +11294,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11250,6 +11323,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11544,6 +11618,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11572,6 +11647,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11866,6 +11942,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11894,6 +11971,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -12079,6 +12157,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_system_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12094,6 +12173,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX12-CU-LABEL: flat_system_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12264,6 +12344,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12279,6 +12360,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12472,6 +12554,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12491,6 +12574,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX12-CU-LABEL: flat_system_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12702,6 +12786,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12725,6 +12810,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13493,6 +13579,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13505,6 +13592,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13670,6 +13758,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13684,6 +13773,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13844,6 +13934,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13861,6 +13952,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -14049,6 +14141,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -14068,6 +14161,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -14258,6 +14352,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -14277,6 +14372,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -15198,6 +15294,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15214,6 +15311,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15454,6 +15552,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15472,6 +15571,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15707,6 +15807,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15728,6 +15829,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15991,6 +16093,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16014,6 +16117,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16279,6 +16383,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16302,6 +16407,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16549,6 +16655,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16567,6 +16674,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16809,6 +16917,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16827,6 +16936,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17087,6 +17197,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17110,6 +17221,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17375,6 +17487,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17398,6 +17511,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17663,6 +17777,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17686,6 +17801,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17951,6 +18067,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17974,6 +18091,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18239,6 +18357,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18262,6 +18381,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18527,6 +18647,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18550,6 +18671,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18815,6 +18937,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18838,6 +18961,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19103,6 +19227,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19126,6 +19251,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19380,6 +19506,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19400,6 +19527,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19676,6 +19804,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19700,6 +19829,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19975,6 +20105,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20000,6 +20131,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20299,6 +20431,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20328,6 +20461,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20631,6 +20765,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20660,6 +20795,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20945,6 +21081,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20969,6 +21106,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21249,6 +21387,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21273,6 +21412,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21571,6 +21711,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21600,6 +21741,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21903,6 +22045,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21932,6 +22075,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22235,6 +22379,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22264,6 +22409,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22567,6 +22713,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22596,6 +22743,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22899,6 +23047,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22928,6 +23077,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23231,6 +23381,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23260,6 +23411,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23563,6 +23715,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23592,6 +23745,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23895,6 +24049,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23924,6 +24079,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 6bf54ccabc9dad..c2b7aa4fcfbf1e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -110,6 +110,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-WGP-LABEL: flat_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -128,6 +129,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-LABEL: flat_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -329,18 +331,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, s4
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1
; GFX12-WGP-NEXT: s_mov_b32 s2, s5
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -361,18 +368,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_mov_b32 s2, 0
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr2
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, s4
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1
; GFX12-CU-NEXT: s_mov_b32 s2, s5
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -498,6 +510,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-WGP-LABEL: flat_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -518,6 +531,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-LABEL: flat_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -727,17 +741,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-WGP-NEXT: s_mov_b32 s0, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-WGP-NEXT: s_mov_b32 s0, 0
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
; GFX12-WGP-NEXT: s_mov_b32 s1, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3
; GFX12-WGP-NEXT: s_mov_b32 s0, s3
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
@@ -761,17 +780,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-CU-NEXT: s_mov_b32 s0, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-CU-NEXT: s_mov_b32 s0, 0
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
; GFX12-CU-NEXT: s_mov_b32 s1, s2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3
; GFX12-CU-NEXT: s_mov_b32 s0, s3
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
@@ -896,6 +920,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -914,6 +939,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index c7826181cc8dde..23982f8a00cdb8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX12-WGP-LABEL: flat_wavefront_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX12-CU-LABEL: flat_wavefront_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX12-CU-LABEL: flat_wavefront_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -534,6 +538,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX12-WGP-LABEL: flat_wavefront_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -549,6 +554,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX12-CU-LABEL: flat_wavefront_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -719,6 +725,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -734,6 +741,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1438,6 +1446,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1450,6 +1459,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1590,6 +1600,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1602,6 +1613,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1742,6 +1754,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1754,6 +1767,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1894,6 +1908,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1906,6 +1921,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2046,6 +2062,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2058,6 +2075,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2823,6 +2841,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -2839,6 +2858,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3054,6 +3074,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3070,6 +3091,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3285,6 +3307,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3301,6 +3324,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3516,6 +3540,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3532,6 +3557,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3747,6 +3773,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3763,6 +3790,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3978,6 +4006,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3994,6 +4023,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4209,6 +4239,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4225,6 +4256,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4440,6 +4472,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4456,6 +4489,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4671,6 +4705,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4687,6 +4722,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4902,6 +4938,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4918,6 +4955,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5133,6 +5171,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5149,6 +5188,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5364,6 +5404,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5380,6 +5421,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5595,6 +5637,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5611,6 +5654,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5826,6 +5870,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5842,6 +5887,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6057,6 +6103,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6073,6 +6120,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6320,6 +6368,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6340,6 +6389,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6593,6 +6643,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6613,6 +6664,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6866,6 +6918,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6886,6 +6939,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7139,6 +7193,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7159,6 +7214,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7412,6 +7468,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7432,6 +7489,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7685,6 +7743,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7705,6 +7764,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7958,6 +8018,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7978,6 +8039,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8231,6 +8293,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8251,6 +8314,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8504,6 +8568,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8524,6 +8589,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8777,6 +8843,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8797,6 +8864,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9050,6 +9118,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9070,6 +9139,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9323,6 +9393,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9343,6 +9414,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9596,6 +9668,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9616,6 +9689,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9869,6 +9943,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9889,6 +9964,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10142,6 +10218,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10162,6 +10239,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10339,6 +10417,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10354,6 +10433,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10524,6 +10604,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10539,6 +10620,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10709,6 +10791,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10724,6 +10807,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10894,6 +10978,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10909,6 +10994,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11613,6 +11699,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11625,6 +11712,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11765,6 +11853,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11777,6 +11866,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11917,6 +12007,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11929,6 +12020,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12069,6 +12161,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12081,6 +12174,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12221,6 +12315,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12233,6 +12328,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12998,6 +13094,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13014,6 +13111,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13229,6 +13327,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13245,6 +13344,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13460,6 +13560,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13476,6 +13577,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13691,6 +13793,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13707,6 +13810,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13922,6 +14026,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13938,6 +14043,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14153,6 +14259,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14169,6 +14276,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14384,6 +14492,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14400,6 +14509,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14615,6 +14725,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14631,6 +14742,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14846,6 +14958,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14862,6 +14975,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15077,6 +15191,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15093,6 +15208,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15308,6 +15424,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15324,6 +15441,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15539,6 +15657,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15555,6 +15674,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15770,6 +15890,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15786,6 +15907,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16001,6 +16123,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16017,6 +16140,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16232,6 +16356,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16248,6 +16373,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16495,6 +16621,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16515,6 +16642,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16768,6 +16896,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16788,6 +16917,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17041,6 +17171,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17061,6 +17192,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17314,6 +17446,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17334,6 +17467,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17587,6 +17721,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17607,6 +17742,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17860,6 +17996,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17880,6 +18017,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18133,6 +18271,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18153,6 +18292,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18406,6 +18546,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18426,6 +18567,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18679,6 +18821,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18699,6 +18842,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18952,6 +19096,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18972,6 +19117,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19225,6 +19371,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19245,6 +19392,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19498,6 +19646,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19518,6 +19667,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19771,6 +19921,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19791,6 +19942,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20044,6 +20196,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20064,6 +20217,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 8949e4b782f630..cd2c8176b8d33e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX12-WGP-LABEL: flat_workgroup_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX12-CU-LABEL: flat_workgroup_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX12-CU-LABEL: flat_workgroup_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -544,6 +548,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX12-WGP-LABEL: flat_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -562,6 +567,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX12-CU-LABEL: flat_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -755,6 +761,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -777,6 +784,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1519,6 +1527,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1531,6 +1540,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1687,6 +1697,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1701,6 +1712,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1854,6 +1866,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1871,6 +1884,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2040,6 +2054,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2059,6 +2074,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2229,6 +2245,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2248,6 +2265,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -3093,6 +3111,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3109,6 +3128,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3340,6 +3360,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3358,6 +3379,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3586,6 +3608,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3607,6 +3630,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3851,6 +3875,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3874,6 +3899,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4119,6 +4145,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4142,6 +4169,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4375,6 +4403,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4393,6 +4422,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4625,6 +4655,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4643,6 +4674,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4887,6 +4919,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4910,6 +4943,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5155,6 +5189,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5178,6 +5213,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5423,6 +5459,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5446,6 +5483,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5691,6 +5729,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5714,6 +5753,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5963,6 +6003,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5983,6 +6024,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6246,6 +6288,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6269,6 +6312,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6535,6 +6579,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6560,6 +6605,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6836,6 +6882,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6864,6 +6911,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7141,6 +7189,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7169,6 +7218,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7434,6 +7484,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7457,6 +7508,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7721,6 +7773,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7744,6 +7797,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8020,6 +8074,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8048,6 +8103,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8325,6 +8381,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8353,6 +8410,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8630,6 +8688,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8658,6 +8717,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8935,6 +8995,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8963,6 +9024,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9240,6 +9302,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9268,6 +9331,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9545,6 +9609,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9573,6 +9638,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9850,6 +9916,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9878,6 +9945,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10155,6 +10223,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10183,6 +10252,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10362,6 +10432,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10377,6 +10448,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10547,6 +10619,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10562,6 +10635,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10738,6 +10812,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10757,6 +10832,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10939,6 +11015,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10962,6 +11039,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11688,6 +11766,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11700,6 +11779,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11848,6 +11928,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11862,6 +11943,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12008,6 +12090,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12025,6 +12108,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12179,6 +12263,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12198,6 +12283,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12352,6 +12438,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12371,6 +12458,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13188,6 +13276,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13204,6 +13293,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13427,6 +13517,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13445,6 +13536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13666,6 +13758,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13687,6 +13780,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13916,6 +14010,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13939,6 +14034,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14168,6 +14264,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14191,6 +14288,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14414,6 +14512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14432,6 +14531,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14655,6 +14755,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14673,6 +14774,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14902,6 +15004,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14925,6 +15028,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15154,6 +15258,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15177,6 +15282,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15406,6 +15512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15429,6 +15536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15658,6 +15766,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15681,6 +15790,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15910,6 +16020,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15933,6 +16044,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16162,6 +16274,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16185,6 +16298,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16414,6 +16528,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16437,6 +16552,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16666,6 +16782,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16689,6 +16806,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16936,6 +17054,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16956,6 +17075,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17215,6 +17335,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17239,6 +17360,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17498,6 +17620,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17523,6 +17646,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17788,6 +17912,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17817,6 +17942,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18082,6 +18208,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18111,6 +18238,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18370,6 +18498,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18394,6 +18523,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18653,6 +18783,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18677,6 +18808,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18942,6 +19074,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18971,6 +19104,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19236,6 +19370,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19265,6 +19400,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19530,6 +19666,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19559,6 +19696,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19824,6 +19962,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19853,6 +19992,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20118,6 +20258,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20147,6 +20288,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20412,6 +20554,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20441,6 +20584,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20706,6 +20850,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20735,6 +20880,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21000,6 +21146,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21029,6 +21176,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index b56860991b1948..4ba64af63e5f52 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX12-WGP-LABEL: global_agent_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX12-CU-LABEL: global_agent_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX12-WGP-LABEL: global_agent_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX12-CU-LABEL: global_agent_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -588,6 +592,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX12-WGP-LABEL: global_agent_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -603,6 +608,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX12-CU-LABEL: global_agent_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -811,6 +817,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-WGP-LABEL: global_agent_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -830,6 +837,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-LABEL: global_agent_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -995,6 +1003,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-WGP-LABEL: global_agent_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1006,6 +1015,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-LABEL: global_agent_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1162,6 +1172,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-WGP-LABEL: global_agent_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1173,6 +1184,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-LABEL: global_agent_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1346,6 +1358,7 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-WGP-LABEL: global_agent_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1362,6 +1375,7 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-LABEL: global_agent_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1540,6 +1554,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-WGP-LABEL: global_agent_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1556,6 +1571,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-LABEL: global_agent_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3395,6 +3411,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3410,6 +3427,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3644,6 +3662,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3661,6 +3680,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3889,6 +3909,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3909,6 +3930,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4165,6 +4187,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4187,6 +4210,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4445,6 +4469,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4467,6 +4492,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4708,6 +4734,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4725,6 +4752,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4961,6 +4989,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4978,6 +5007,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5231,6 +5261,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5253,6 +5284,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5511,6 +5543,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5533,6 +5566,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5791,6 +5825,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5813,6 +5848,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6071,6 +6107,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6093,6 +6130,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6351,6 +6389,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6373,6 +6412,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6631,6 +6671,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6653,6 +6694,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6911,6 +6953,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6933,6 +6976,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7191,6 +7235,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7213,6 +7258,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7454,6 +7500,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7471,6 +7518,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7724,6 +7772,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7744,6 +7793,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8002,6 +8052,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8024,6 +8075,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8299,6 +8351,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8324,6 +8377,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8602,6 +8656,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8627,6 +8682,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8888,6 +8944,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8908,6 +8965,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9164,6 +9222,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9184,6 +9243,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9457,6 +9517,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9482,6 +9543,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9760,6 +9822,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9785,6 +9848,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10063,6 +10127,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10088,6 +10153,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10366,6 +10432,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10391,6 +10458,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10669,6 +10737,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10694,6 +10763,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10972,6 +11042,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10997,6 +11068,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11275,6 +11347,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11300,6 +11373,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11578,6 +11652,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11603,6 +11678,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11802,6 +11878,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX12-WGP-LABEL: global_agent_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11814,6 +11891,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX12-CU-LABEL: global_agent_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11998,6 +12076,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12010,6 +12089,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12208,6 +12288,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12223,6 +12304,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX12-CU-LABEL: global_agent_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12431,6 +12513,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12450,6 +12533,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12615,6 +12699,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-WGP-LABEL: global_agent_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12626,6 +12711,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-LABEL: global_agent_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12782,6 +12868,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12793,6 +12880,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12966,6 +13054,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-WGP-LABEL: global_agent_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12982,6 +13071,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-LABEL: global_agent_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -13160,6 +13250,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -13176,6 +13267,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -15015,6 +15107,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15030,6 +15123,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15264,6 +15358,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15281,6 +15376,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15509,6 +15605,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15529,6 +15626,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15785,6 +15883,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15807,6 +15906,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16065,6 +16165,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16087,6 +16188,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16328,6 +16430,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16345,6 +16448,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16581,6 +16685,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16598,6 +16703,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16851,6 +16957,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16873,6 +16980,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17131,6 +17239,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17153,6 +17262,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17411,6 +17521,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17433,6 +17544,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17691,6 +17803,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17713,6 +17826,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17971,6 +18085,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17993,6 +18108,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18251,6 +18367,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18273,6 +18390,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18531,6 +18649,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18553,6 +18672,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18811,6 +18931,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18833,6 +18954,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19074,6 +19196,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19091,6 +19214,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19344,6 +19468,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19364,6 +19489,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19637,6 +19763,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19662,6 +19789,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19940,6 +20068,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19965,6 +20094,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20226,6 +20356,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20246,6 +20377,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20502,6 +20634,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20522,6 +20655,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20795,6 +20929,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20820,6 +20955,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21098,6 +21234,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21123,6 +21260,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21401,6 +21539,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21426,6 +21565,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21704,6 +21844,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21729,6 +21870,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22007,6 +22149,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22032,6 +22175,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22310,6 +22454,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22335,6 +22480,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22613,6 +22759,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22638,6 +22785,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22916,6 +23064,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22941,6 +23090,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index 7a9cb992a0cd16..0fc3212b0f46d9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr
; GFX12-LABEL: global_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -25,13 +26,16 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr
; GFX12-LABEL: global_last_use_load_1:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v0
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-NEXT: s_mov_b32 s4, 2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU
@@ -50,6 +54,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
; GFX12-LABEL: global_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -70,13 +75,16 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1)
; GFX12-LABEL: global_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v0
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-NEXT: s_mov_b32 s4, 2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 9b2b3a4cfa9bae..14f1734235673a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -178,6 +178,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX12-WGP-LABEL: global_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -191,6 +192,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX12-CU-LABEL: global_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -437,13 +439,16 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX12-WGP-LABEL: global_nontemporal_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-WGP-NEXT: s_mov_b32 s4, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT
@@ -454,13 +459,16 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX12-CU-LABEL: global_nontemporal_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-CU-NEXT: s_mov_b32 s4, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT
@@ -641,6 +649,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX12-WGP-LABEL: global_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -654,6 +663,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX12-CU-LABEL: global_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -881,13 +891,16 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX12-WGP-LABEL: global_nontemporal_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
@@ -897,13 +910,16 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX12-CU-LABEL: global_nontemporal_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
@@ -1087,6 +1103,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-WGP-LABEL: global_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -1101,6 +1118,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-LABEL: global_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index afc46fbc23a67a..33aaeebf658dd6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX12-WGP-LABEL: global_singlethread_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX12-CU-LABEL: global_singlethread_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX12-WGP-LABEL: global_singlethread_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX12-CU-LABEL: global_singlethread_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -574,6 +578,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX12-WGP-LABEL: global_singlethread_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -586,6 +591,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX12-CU-LABEL: global_singlethread_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -770,6 +776,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -782,6 +789,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX12-CU-LABEL: global_singlethread_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -940,6 +948,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-WGP-LABEL: global_singlethread_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -951,6 +960,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-LABEL: global_singlethread_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1107,6 +1117,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-WGP-LABEL: global_singlethread_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1118,6 +1129,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-LABEL: global_singlethread_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1274,6 +1286,7 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-WGP-LABEL: global_singlethread_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1285,6 +1298,7 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-LABEL: global_singlethread_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1441,6 +1455,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1452,6 +1467,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-LABEL: global_singlethread_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3004,6 +3020,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3019,6 +3036,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3228,6 +3246,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3243,6 +3262,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3452,6 +3472,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3467,6 +3488,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3676,6 +3698,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3691,6 +3714,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3900,6 +3924,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3915,6 +3940,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4124,6 +4150,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4139,6 +4166,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4348,6 +4376,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4363,6 +4392,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4572,6 +4602,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4587,6 +4618,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4796,6 +4828,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4811,6 +4844,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5020,6 +5054,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5035,6 +5070,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5244,6 +5280,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5259,6 +5296,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5468,6 +5506,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5483,6 +5522,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5692,6 +5732,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5707,6 +5748,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5916,6 +5958,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5931,6 +5974,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6140,6 +6184,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6155,6 +6200,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6389,6 +6435,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6406,6 +6453,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6644,6 +6692,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6661,6 +6710,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6899,6 +6949,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6916,6 +6967,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7154,6 +7206,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7171,6 +7224,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7409,6 +7463,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7426,6 +7481,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7664,6 +7720,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7681,6 +7738,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7919,6 +7977,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7936,6 +7995,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8174,6 +8234,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8191,6 +8252,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8429,6 +8491,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8446,6 +8509,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8684,6 +8748,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8701,6 +8766,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8939,6 +9005,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8956,6 +9023,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9194,6 +9262,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9211,6 +9280,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9449,6 +9519,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9466,6 +9537,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9704,6 +9776,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9721,6 +9794,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9959,6 +10033,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9976,6 +10051,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10167,6 +10243,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10179,6 +10256,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX12-CU-LABEL: global_singlethread_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10363,6 +10441,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10375,6 +10454,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10559,6 +10639,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10571,6 +10652,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10755,6 +10837,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10767,6 +10850,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10925,6 +11009,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -10936,6 +11021,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-LABEL: global_singlethread_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11092,6 +11178,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11103,6 +11190,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11259,6 +11347,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-WGP-LABEL: global_singlethread_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11270,6 +11359,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-LABEL: global_singlethread_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11426,6 +11516,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11437,6 +11528,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12989,6 +13081,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13004,6 +13097,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13213,6 +13307,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13228,6 +13323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13437,6 +13533,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13452,6 +13549,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13661,6 +13759,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13676,6 +13775,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13885,6 +13985,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13900,6 +14001,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14109,6 +14211,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14124,6 +14227,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14333,6 +14437,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14348,6 +14453,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14557,6 +14663,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14572,6 +14679,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14781,6 +14889,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14796,6 +14905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15005,6 +15115,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15020,6 +15131,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15229,6 +15341,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15244,6 +15357,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15453,6 +15567,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15468,6 +15583,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15677,6 +15793,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15692,6 +15809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15901,6 +16019,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15916,6 +16035,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16125,6 +16245,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16140,6 +16261,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16374,6 +16496,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16391,6 +16514,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16629,6 +16753,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16646,6 +16771,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16884,6 +17010,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16901,6 +17028,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17139,6 +17267,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17156,6 +17285,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17394,6 +17524,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17411,6 +17542,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17649,6 +17781,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17666,6 +17799,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17904,6 +18038,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17921,6 +18056,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18159,6 +18295,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18176,6 +18313,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18414,6 +18552,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18431,6 +18570,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18669,6 +18809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18686,6 +18827,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18924,6 +19066,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18941,6 +19084,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19179,6 +19323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19196,6 +19341,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19434,6 +19580,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19451,6 +19598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19689,6 +19837,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19706,6 +19855,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19944,6 +20094,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19961,6 +20112,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 62a4f3b43b2dcd..2c877755019cef 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX12-WGP-LABEL: global_system_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX12-CU-LABEL: global_system_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX12-WGP-LABEL: global_system_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX12-CU-LABEL: global_system_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -590,6 +594,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX12-WGP-LABEL: global_system_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -605,6 +610,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX12-CU-LABEL: global_system_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -815,6 +821,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-WGP-LABEL: global_system_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -834,6 +841,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-LABEL: global_system_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -999,6 +1007,7 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-WGP-LABEL: global_system_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1010,6 +1019,7 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-LABEL: global_system_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1166,6 +1176,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-WGP-LABEL: global_system_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1177,6 +1188,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-LABEL: global_system_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1352,6 +1364,7 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-WGP-LABEL: global_system_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1368,6 +1381,7 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-LABEL: global_system_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1548,6 +1562,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-WGP-LABEL: global_system_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1564,6 +1579,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-LABEL: global_system_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3425,6 +3441,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3440,6 +3457,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3676,6 +3694,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3693,6 +3712,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3923,6 +3943,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3943,6 +3964,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4203,6 +4225,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4225,6 +4248,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4487,6 +4511,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4509,6 +4534,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4752,6 +4778,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4769,6 +4796,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5007,6 +5035,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5024,6 +5053,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5281,6 +5311,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5303,6 +5334,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5565,6 +5597,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5587,6 +5620,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5849,6 +5883,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5871,6 +5906,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6133,6 +6169,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6155,6 +6192,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6396,6 +6434,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6413,6 +6452,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6668,6 +6708,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6688,6 +6729,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6965,6 +7007,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6990,6 +7033,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7272,6 +7316,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7297,6 +7342,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7560,6 +7606,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7580,6 +7627,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7838,6 +7886,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7858,6 +7907,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8135,6 +8185,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8160,6 +8211,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8442,6 +8494,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8467,6 +8520,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8749,6 +8803,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8774,6 +8829,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9056,6 +9112,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9081,6 +9138,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9363,6 +9421,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9388,6 +9447,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9670,6 +9730,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9695,6 +9756,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9977,6 +10039,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10002,6 +10065,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10284,6 +10348,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10309,6 +10374,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10508,6 +10574,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX12-WGP-LABEL: global_system_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10520,6 +10587,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX12-CU-LABEL: global_system_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10704,6 +10772,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10716,6 +10785,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX12-CU-LABEL: global_system_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10916,6 +10986,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX12-WGP-LABEL: global_system_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10931,6 +11002,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX12-CU-LABEL: global_system_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11141,6 +11213,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11160,6 +11233,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11325,6 +11399,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-WGP-LABEL: global_system_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11336,6 +11411,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-LABEL: global_system_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11492,6 +11568,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11503,6 +11580,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-LABEL: global_system_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11678,6 +11756,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-WGP-LABEL: global_system_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11694,6 +11773,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-LABEL: global_system_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11874,6 +11954,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11890,6 +11971,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -13751,6 +13833,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13766,6 +13849,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14002,6 +14086,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14019,6 +14104,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14249,6 +14335,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14269,6 +14356,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14529,6 +14617,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14551,6 +14640,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14813,6 +14903,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14835,6 +14926,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15078,6 +15170,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15095,6 +15188,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15333,6 +15427,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15350,6 +15445,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15607,6 +15703,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15629,6 +15726,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15891,6 +15989,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15913,6 +16012,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16175,6 +16275,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16197,6 +16298,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16459,6 +16561,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16481,6 +16584,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16743,6 +16847,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16765,6 +16870,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17027,6 +17133,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17049,6 +17156,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17311,6 +17419,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17333,6 +17442,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17595,6 +17705,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17617,6 +17728,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17858,6 +17970,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17875,6 +17988,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18130,6 +18244,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18150,6 +18265,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18410,6 +18526,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18432,6 +18549,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18711,6 +18829,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18736,6 +18855,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19018,6 +19138,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19043,6 +19164,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19306,6 +19428,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19326,6 +19449,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19584,6 +19708,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19604,6 +19729,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19881,6 +20007,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19906,6 +20033,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20188,6 +20316,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20213,6 +20342,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20495,6 +20625,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20520,6 +20651,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20802,6 +20934,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20827,6 +20960,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21109,6 +21243,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21134,6 +21269,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21416,6 +21552,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21441,6 +21578,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21723,6 +21861,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21748,6 +21887,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22030,6 +22170,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22055,6 +22196,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index a98efb49b4b72b..692aee5f4b9eaf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -126,6 +126,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-WGP-LABEL: global_volatile_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -140,6 +141,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-LABEL: global_volatile_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -315,13 +317,16 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-WGP-LABEL: global_volatile_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-WGP-NEXT: s_mov_b32 s4, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS
@@ -334,13 +339,16 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-LABEL: global_volatile_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-CU-NEXT: s_mov_b32 s4, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS
@@ -474,6 +482,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX12-WGP-LABEL: global_volatile_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -493,6 +502,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX12-CU-LABEL: global_volatile_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -665,13 +675,16 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX12-WGP-LABEL: global_volatile_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
@@ -687,13 +700,16 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX12-CU-LABEL: global_volatile_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
@@ -833,6 +849,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX12-WGP-LABEL: global_volatile_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -848,6 +865,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX12-CU-LABEL: global_volatile_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -967,6 +985,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-WGP-LABEL: global_volatile_workgroup_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -983,6 +1002,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-LABEL: global_volatile_workgroup_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index f805e2cf37006c..aaa11c0455606f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX12-WGP-LABEL: global_wavefront_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX12-CU-LABEL: global_wavefront_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX12-WGP-LABEL: global_wavefront_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX12-CU-LABEL: global_wavefront_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -574,6 +578,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX12-WGP-LABEL: global_wavefront_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -586,6 +591,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX12-CU-LABEL: global_wavefront_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -770,6 +776,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -782,6 +789,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX12-CU-LABEL: global_wavefront_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -940,6 +948,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-WGP-LABEL: global_wavefront_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -951,6 +960,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-LABEL: global_wavefront_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1107,6 +1117,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-WGP-LABEL: global_wavefront_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1118,6 +1129,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-LABEL: global_wavefront_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1274,6 +1286,7 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-WGP-LABEL: global_wavefront_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1285,6 +1298,7 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-LABEL: global_wavefront_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1441,6 +1455,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1452,6 +1467,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-LABEL: global_wavefront_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3004,6 +3020,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3019,6 +3036,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3228,6 +3246,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3243,6 +3262,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3452,6 +3472,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3467,6 +3488,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3676,6 +3698,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3691,6 +3714,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3900,6 +3924,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3915,6 +3940,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4124,6 +4150,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4139,6 +4166,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4348,6 +4376,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4363,6 +4392,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4572,6 +4602,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4587,6 +4618,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4796,6 +4828,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4811,6 +4844,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5020,6 +5054,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5035,6 +5070,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5244,6 +5280,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5259,6 +5296,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5468,6 +5506,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5483,6 +5522,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5692,6 +5732,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5707,6 +5748,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5916,6 +5958,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5931,6 +5974,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6140,6 +6184,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6155,6 +6200,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6389,6 +6435,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6406,6 +6453,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6644,6 +6692,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6661,6 +6710,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6899,6 +6949,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6916,6 +6967,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7154,6 +7206,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7171,6 +7224,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7409,6 +7463,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7426,6 +7481,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7664,6 +7720,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7681,6 +7738,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7919,6 +7977,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7936,6 +7995,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8174,6 +8234,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8191,6 +8252,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8429,6 +8491,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8446,6 +8509,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8684,6 +8748,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8701,6 +8766,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8939,6 +9005,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8956,6 +9023,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9194,6 +9262,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9211,6 +9280,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9449,6 +9519,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9466,6 +9537,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9704,6 +9776,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9721,6 +9794,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9959,6 +10033,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9976,6 +10051,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10167,6 +10243,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10179,6 +10256,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX12-CU-LABEL: global_wavefront_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10363,6 +10441,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10375,6 +10454,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10559,6 +10639,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10571,6 +10652,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10755,6 +10837,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10767,6 +10850,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10925,6 +11009,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -10936,6 +11021,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-LABEL: global_wavefront_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11092,6 +11178,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11103,6 +11190,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11259,6 +11347,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11270,6 +11359,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-LABEL: global_wavefront_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11426,6 +11516,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11437,6 +11528,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12989,6 +13081,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13004,6 +13097,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13213,6 +13307,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13228,6 +13323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13437,6 +13533,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13452,6 +13549,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13661,6 +13759,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13676,6 +13775,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13885,6 +13985,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13900,6 +14001,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14109,6 +14211,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14124,6 +14227,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14333,6 +14437,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14348,6 +14453,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14557,6 +14663,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14572,6 +14679,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14781,6 +14889,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14796,6 +14905,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15005,6 +15115,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15020,6 +15131,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15229,6 +15341,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15244,6 +15357,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15453,6 +15567,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15468,6 +15583,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15677,6 +15793,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15692,6 +15809,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15901,6 +16019,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15916,6 +16035,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16125,6 +16245,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16140,6 +16261,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16374,6 +16496,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16391,6 +16514,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16629,6 +16753,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16646,6 +16771,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16884,6 +17010,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16901,6 +17028,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17139,6 +17267,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17156,6 +17285,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17394,6 +17524,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17411,6 +17542,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17649,6 +17781,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17666,6 +17799,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17904,6 +18038,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17921,6 +18056,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18159,6 +18295,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18176,6 +18313,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18414,6 +18552,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18431,6 +18570,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18669,6 +18809,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18686,6 +18827,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18924,6 +19066,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18941,6 +19084,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19179,6 +19323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19196,6 +19341,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19434,6 +19580,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19451,6 +19598,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19689,6 +19837,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19706,6 +19855,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19944,6 +20094,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19961,6 +20112,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 30bf4920715352..25c75aa50df091 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX12-WGP-LABEL: global_workgroup_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX12-CU-LABEL: global_workgroup_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX12-WGP-LABEL: global_workgroup_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX12-CU-LABEL: global_workgroup_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -578,6 +582,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX12-WGP-LABEL: global_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -593,6 +598,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX12-CU-LABEL: global_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -786,6 +792,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -805,6 +812,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-LABEL: global_workgroup_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -964,6 +972,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-WGP-LABEL: global_workgroup_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -975,6 +984,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-LABEL: global_workgroup_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1131,6 +1141,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-WGP-LABEL: global_workgroup_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1142,6 +1153,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-LABEL: global_workgroup_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1311,6 +1323,7 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-WGP-LABEL: global_workgroup_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1327,6 +1340,7 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-LABEL: global_workgroup_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1497,6 +1511,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1513,6 +1528,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-LABEL: global_workgroup_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3212,6 +3228,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3227,6 +3244,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3444,6 +3462,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3461,6 +3480,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3683,6 +3703,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3703,6 +3724,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3934,6 +3956,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3956,6 +3979,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4187,6 +4211,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4209,6 +4234,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4427,6 +4453,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4444,6 +4471,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4661,6 +4689,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4678,6 +4707,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4908,6 +4938,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4930,6 +4961,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5161,6 +5193,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5183,6 +5216,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5414,6 +5448,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5436,6 +5471,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5667,6 +5703,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5689,6 +5726,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5920,6 +5958,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5942,6 +5981,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6173,6 +6213,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6195,6 +6236,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6426,6 +6468,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6448,6 +6491,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6679,6 +6723,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6701,6 +6746,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6936,6 +6982,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6953,6 +7000,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7195,6 +7243,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7215,6 +7264,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7466,6 +7516,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7488,6 +7539,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7744,6 +7796,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7769,6 +7822,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8025,6 +8079,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8050,6 +8105,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8293,6 +8349,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8313,6 +8370,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8555,6 +8613,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8575,6 +8634,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8830,6 +8890,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8855,6 +8916,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9111,6 +9173,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9136,6 +9199,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9392,6 +9456,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9417,6 +9482,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9673,6 +9739,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9698,6 +9765,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9954,6 +10022,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9979,6 +10048,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10235,6 +10305,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10260,6 +10331,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10516,6 +10588,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10541,6 +10614,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10797,6 +10871,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10822,6 +10897,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11014,6 +11090,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11026,6 +11103,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX12-CU-LABEL: global_workgroup_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11210,6 +11288,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11222,6 +11301,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11410,6 +11490,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11425,6 +11506,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11615,6 +11697,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11634,6 +11717,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11792,6 +11876,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11803,6 +11888,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-LABEL: global_workgroup_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11959,6 +12045,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11970,6 +12057,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12132,6 +12220,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12148,6 +12237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-LABEL: global_workgroup_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12310,6 +12400,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12326,6 +12417,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -13984,6 +14076,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13999,6 +14092,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14216,6 +14310,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14233,6 +14328,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14448,6 +14544,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14468,6 +14565,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14691,6 +14789,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14713,6 +14812,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14936,6 +15036,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14958,6 +15059,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15175,6 +15277,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15192,6 +15295,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15409,6 +15513,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15426,6 +15531,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15649,6 +15755,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15671,6 +15778,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15894,6 +16002,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15916,6 +16025,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16139,6 +16249,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16161,6 +16272,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16384,6 +16496,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16406,6 +16519,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16629,6 +16743,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16651,6 +16766,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16874,6 +16990,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16896,6 +17013,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17119,6 +17237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17141,6 +17260,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17364,6 +17484,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17386,6 +17507,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17620,6 +17742,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17637,6 +17760,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17879,6 +18003,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17899,6 +18024,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18143,6 +18269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18165,6 +18292,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18413,6 +18541,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18438,6 +18567,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18686,6 +18816,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18711,6 +18842,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18953,6 +19085,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18973,6 +19106,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19215,6 +19349,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19235,6 +19370,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19483,6 +19619,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19508,6 +19645,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19756,6 +19894,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19781,6 +19920,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20029,6 +20169,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20054,6 +20195,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20302,6 +20444,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20327,6 +20470,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20575,6 +20719,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20600,6 +20745,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20848,6 +20994,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20873,6 +21020,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21121,6 +21269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21146,6 +21295,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21394,6 +21544,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21419,6 +21570,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
index 67ca31a2bb84e6..1a2058cbe39e4e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 < %s 2>&1 | FileCheck %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s 2>&1 | FileCheck %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s 2>&1 | FileCheck %s
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 02cd97c9fe82a7..d925ca52f85600 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9392,6 +9450,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15050,6 +15143,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index ba9711333a1946..fce60ff12aed3d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -183,6 +183,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX12-WGP-LABEL: local_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -196,6 +197,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX12-CU-LABEL: local_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -420,13 +422,16 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-WGP-LABEL: local_nontemporal_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
@@ -437,13 +442,16 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-CU-LABEL: local_nontemporal_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX12-CU-NEXT: ds_load_b32 v1, v1
@@ -615,6 +623,7 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX12-WGP-LABEL: local_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -628,6 +637,7 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX12-CU-LABEL: local_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -826,8 +836,10 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-WGP-NEXT: s_mov_b32 s1, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
@@ -841,8 +853,10 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-CU-NEXT: s_mov_b32 s1, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
@@ -1027,6 +1041,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
; GFX12-WGP-LABEL: local_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1040,6 +1055,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
; GFX12-CU-LABEL: local_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
index fe5f2c51734f7a..033c71574643cf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
@@ -2657,6 +2657,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2670,6 +2671,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2837,6 +2839,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2850,6 +2853,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3017,6 +3021,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3030,6 +3035,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3197,6 +3203,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3210,6 +3217,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3377,6 +3385,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3390,6 +3399,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3557,6 +3567,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3570,6 +3581,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3737,6 +3749,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3750,6 +3763,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3917,6 +3931,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3930,6 +3945,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4097,6 +4113,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4127,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4277,6 +4295,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4290,6 +4309,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4457,6 +4477,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4470,6 +4491,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4637,6 +4659,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4650,6 +4673,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4817,6 +4841,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4830,6 +4855,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4997,6 +5023,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5010,6 +5037,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5177,6 +5205,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5190,6 +5219,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5390,6 +5420,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5406,6 +5437,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5611,6 +5643,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5627,6 +5660,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5832,6 +5866,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5848,6 +5883,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6053,6 +6089,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6069,6 +6106,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6274,6 +6312,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6290,6 +6329,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6495,6 +6535,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6511,6 +6552,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6716,6 +6758,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6732,6 +6775,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6937,6 +6981,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6953,6 +6998,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7158,6 +7204,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7174,6 +7221,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7379,6 +7427,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7395,6 +7444,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7600,6 +7650,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7616,6 +7667,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7821,6 +7873,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7837,6 +7890,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8042,6 +8096,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8058,6 +8113,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8263,6 +8319,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8279,6 +8336,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8484,6 +8542,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8500,6 +8559,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -11165,6 +11225,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11178,6 +11239,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11345,6 +11407,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11358,6 +11421,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11525,6 +11589,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11538,6 +11603,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11705,6 +11771,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11718,6 +11785,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11885,6 +11953,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11898,6 +11967,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12065,6 +12135,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12078,6 +12149,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12245,6 +12317,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12258,6 +12331,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12425,6 +12499,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12438,6 +12513,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12605,6 +12681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12618,6 +12695,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12785,6 +12863,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12798,6 +12877,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12965,6 +13045,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12978,6 +13059,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13145,6 +13227,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13158,6 +13241,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13325,6 +13409,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13338,6 +13423,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13505,6 +13591,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13518,6 +13605,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13685,6 +13773,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13698,6 +13787,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13898,6 +13988,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -13914,6 +14005,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14119,6 +14211,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14135,6 +14228,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14340,6 +14434,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14356,6 +14451,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14561,6 +14657,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14577,6 +14674,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14782,6 +14880,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14798,6 +14897,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15003,6 +15103,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15019,6 +15120,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15224,6 +15326,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15240,6 +15343,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15445,6 +15549,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15461,6 +15566,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15666,6 +15772,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15682,6 +15789,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15887,6 +15995,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15903,6 +16012,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16108,6 +16218,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16124,6 +16235,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16329,6 +16441,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16345,6 +16458,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16550,6 +16664,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16566,6 +16681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16771,6 +16887,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16787,6 +16904,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16992,6 +17110,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17008,6 +17127,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 1c4c8d41b18f9d..548c5aceb25f74 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9392,6 +9450,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15050,6 +15143,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index a52dd9b3401696..a8f7051bd5050c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -123,6 +123,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX12-WGP-LABEL: local_volatile_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -136,6 +137,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX12-CU-LABEL: local_volatile_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -284,13 +286,16 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-WGP-LABEL: local_volatile_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
@@ -301,13 +306,16 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-CU-LABEL: local_volatile_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX12-CU-NEXT: ds_load_b32 v1, v1
@@ -423,6 +431,7 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-WGP-LABEL: local_volatile_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -441,6 +450,7 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-LABEL: local_volatile_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -576,8 +586,10 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-WGP-NEXT: s_mov_b32 s1, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
@@ -596,8 +608,10 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-CU-NEXT: s_mov_b32 s1, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
index 02e4e0d69dc205..694ffb2964f569 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
@@ -2657,6 +2657,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2670,6 +2671,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2837,6 +2839,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2850,6 +2853,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3017,6 +3021,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3030,6 +3035,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3197,6 +3203,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3210,6 +3217,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3377,6 +3385,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3390,6 +3399,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3557,6 +3567,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3570,6 +3581,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3737,6 +3749,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3750,6 +3763,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3917,6 +3931,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3930,6 +3945,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4097,6 +4113,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4127,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4277,6 +4295,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4290,6 +4309,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4457,6 +4477,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4470,6 +4491,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4637,6 +4659,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4650,6 +4673,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4817,6 +4841,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4830,6 +4855,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4997,6 +5023,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5010,6 +5037,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5177,6 +5205,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5190,6 +5219,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5390,6 +5420,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5406,6 +5437,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5611,6 +5643,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5627,6 +5660,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5832,6 +5866,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5848,6 +5883,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6053,6 +6089,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6069,6 +6106,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6274,6 +6312,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6290,6 +6329,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6495,6 +6535,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6511,6 +6552,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6716,6 +6758,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6732,6 +6775,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6937,6 +6981,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6953,6 +6998,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7158,6 +7204,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7174,6 +7221,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7379,6 +7427,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7395,6 +7444,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7600,6 +7650,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7616,6 +7667,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7821,6 +7873,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7837,6 +7890,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8042,6 +8096,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8058,6 +8113,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8263,6 +8319,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8279,6 +8336,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8484,6 +8542,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8500,6 +8559,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -11165,6 +11225,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11178,6 +11239,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11345,6 +11407,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11358,6 +11421,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11525,6 +11589,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11538,6 +11603,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11705,6 +11771,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11718,6 +11785,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11885,6 +11953,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11898,6 +11967,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12065,6 +12135,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12078,6 +12149,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12245,6 +12317,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12258,6 +12331,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12425,6 +12499,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12438,6 +12513,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12605,6 +12681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12618,6 +12695,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12785,6 +12863,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12798,6 +12877,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12965,6 +13045,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12978,6 +13059,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13145,6 +13227,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13158,6 +13241,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13325,6 +13409,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13338,6 +13423,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13505,6 +13591,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13518,6 +13605,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13685,6 +13773,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13698,6 +13787,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13898,6 +13988,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -13914,6 +14005,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14119,6 +14211,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14135,6 +14228,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14340,6 +14434,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14356,6 +14451,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14561,6 +14657,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14577,6 +14674,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14782,6 +14880,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14798,6 +14897,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15003,6 +15103,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15019,6 +15120,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15224,6 +15326,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15240,6 +15343,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15445,6 +15549,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15461,6 +15566,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15666,6 +15772,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15682,6 +15789,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15887,6 +15995,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15903,6 +16012,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16108,6 +16218,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16124,6 +16235,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16329,6 +16441,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16345,6 +16458,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16550,6 +16664,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16566,6 +16681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16771,6 +16887,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16787,6 +16904,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16992,6 +17110,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17008,6 +17127,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index c2429632285378..0cf644c006facd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9392,6 +9450,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15050,6 +15143,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
index 61cec731feb565..8e292fa5929756 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr add
; GFX12-LABEL: private_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -24,13 +25,16 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add
; GFX12-LABEL: private_last_use_load_1:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v0
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-NEXT: s_mov_b32 s3, 2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_LU
@@ -49,6 +53,7 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %
; GFX12-LABEL: private_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -69,6 +74,7 @@ define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5
; GFX12-LABEL: private_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 4e08065e879fd8..c3599c87985bec 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -193,6 +193,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX12-WGP-LABEL: private_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -205,6 +206,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX12-CU-LABEL: private_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -442,13 +444,16 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX12-WGP-LABEL: private_nontemporal_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
@@ -459,13 +464,16 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX12-CU-LABEL: private_nontemporal_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
@@ -648,6 +656,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX12-WGP-LABEL: private_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -660,6 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX12-CU-LABEL: private_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -868,13 +878,16 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX12-WGP-LABEL: private_nontemporal_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
@@ -884,13 +897,16 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX12-CU-LABEL: private_nontemporal_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
@@ -1085,6 +1101,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
; GFX12-WGP-LABEL: private_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1099,6 +1116,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
; GFX12-CU-LABEL: private_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index a68b5f36b806ed..9146f175eefcd1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -135,6 +135,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX12-WGP-LABEL: private_volatile_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -149,6 +150,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX12-CU-LABEL: private_volatile_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -312,13 +314,16 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX12-WGP-LABEL: private_volatile_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
@@ -331,13 +336,16 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX12-CU-LABEL: private_volatile_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
@@ -475,6 +483,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX12-WGP-LABEL: private_volatile_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -493,6 +502,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX12-CU-LABEL: private_volatile_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -646,13 +656,16 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX12-WGP-LABEL: private_volatile_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
@@ -668,13 +681,16 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX12-CU-LABEL: private_volatile_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index bbbbc0dc0f28d6..a642543c3780db 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -11,8 +11,10 @@ define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
; GFX12-NEXT: s_add_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
; GFX12-NEXT: v_s_exp_f32 s0, s0
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%result = call float @llvm.exp2.f32(float %src)
@@ -61,8 +63,10 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
; GFX12-NEXT: s_mul_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
; GFX12-NEXT: v_s_log_f32 s0, s0
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%result = call float @llvm.log2.f32(float %src)
@@ -166,6 +170,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s1, s1, s0
; GFX12-SDAG-NEXT: v_s_sqrt_f32 s2, s1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_mov_b32 s4, s1
; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_add_co_i32 s3, s2, -1
@@ -179,16 +184,19 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_xor_b32 s6, s4, 0x80000000
; GFX12-SDAG-NEXT: s_fmac_f32 s5, s6, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-SDAG-NEXT: s_cmp_gt_f32 s5, 0
; GFX12-SDAG-NEXT: s_cselect_b32 s2, s4, s3
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000
; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000
; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_cselect_b32 s0, s1, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
@@ -200,6 +208,7 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0
; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_mov_b32 s4, s0
; GFX12-GISEL-NEXT: s_mov_b32 s6, s0
; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -217,11 +226,11 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1
; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
; GFX12-GISEL-NEXT: ; return to shader part epilog
%result = call float @llvm.sqrt.f32(float %src)
@@ -270,10 +279,12 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
; GFX12-NEXT: s_mul_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_s_log_f32 s0, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sub_f32 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%abs = call float @llvm.fabs.f32(float %src)
@@ -291,8 +302,10 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_mul_f32 s0, s1, s0
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
@@ -304,10 +317,12 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%neg = fneg float %src
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 4ea77d1d1ac159..b7aecca45def5b 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -63,10 +63,11 @@ define void @test_remat_s_getpc_b64() {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v2, s30, 0
; GFX12-NEXT: s_getpc_b64 s[0:1]
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s1, s1
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ;;#ASMEND
@@ -74,16 +75,19 @@ define void @test_remat_s_getpc_b64() {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_getpc_b64 s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s1, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_readlane_b32 s31, v2, 1
; GFX12-NEXT: v_readlane_b32 s30, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i64 @llvm.amdgcn.s.getpc()
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
index 1f36f7a0d9616e..d4f7bf656d3b55 100644
--- a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
@@ -62,12 +62,14 @@ define half @swap(half %a, half %b, i32 %i) {
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: swap:
@@ -80,17 +82,19 @@ define half @swap(half %a, half %b, i32 %i) {
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2
; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
index 65feaf23ae2cb3..ab222f4feeef0f 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
@@ -45,18 +45,12 @@
name: mask_hazard_getpc1
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_getpc1
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_getpc1
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_getpc1
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$sgpr0_sgpr1 = S_GETPC_B64
$sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
@@ -67,24 +61,15 @@ body: |
name: mask_hazard_getpc2
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_getpc2
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
- ; GFX11-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc
- ; GFX11-NEXT: }
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_getpc2
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc
- ; GFX12-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc, implicit $scc
- ; GFX12-NEXT: }
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_getpc2
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+ ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc
+ ; GCN-NEXT: }
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
BUNDLE implicit-def $sgpr0_sgpr1 {
$sgpr0_sgpr1 = S_GETPC_B64
@@ -523,18 +508,12 @@ body: |
name: mask_hazard_subreg4
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_subreg4
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- ; GFX11-NEXT: $vcc_lo = S_MOV_B32 0
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_subreg4
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0
- ; GFX12-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_subreg4
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GCN-NEXT: $vcc_lo = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$vcc_lo = S_MOV_B32 0
$sgpr2 = S_MOV_B32 $vcc_lo
@@ -546,18 +525,12 @@ body: |
name: mask_hazard_subreg5
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_subreg5
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- ; GFX11-NEXT: $vcc_hi = S_MOV_B32 0
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_subreg5
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0
- ; GFX12-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_subreg5
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GCN-NEXT: $vcc_hi = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$vcc_hi = S_MOV_B32 0
$sgpr2 = S_MOV_B32 $vcc_hi
@@ -569,20 +542,13 @@ body: |
name: mask_hazard_waitcnt
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_waitcnt
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: S_WAITCNT 0
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_waitcnt
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: S_WAITCNT 0
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_waitcnt
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
S_WAITCNT 0
$sgpr0_sgpr1 = S_GETPC_B64
@@ -595,22 +561,14 @@ body: |
name: mask_hazard_gap1
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_gap1
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
- ; GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_gap1
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
- ; GFX12-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_gap1
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
$vgpr3 = V_MOV_B32_e32 0, implicit $exec
@@ -624,20 +582,13 @@ body: |
name: mask_hazard_gap2
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_gap2
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_gap2
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_gap2
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
$sgpr0_sgpr1 = S_GETPC_B64
@@ -650,20 +601,13 @@ body: |
name: mask_hazard_gap3
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_gap3
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_gap3
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_gap3
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
$sgpr0_sgpr1 = S_GETPC_B64
diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
new file mode 100644
index 00000000000000..2fd8353280f0f1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
@@ -0,0 +1,862 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s
+# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s
+
+--- |
+ @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>]
+
+ define amdgpu_gs void @hazard_getpc1() { ret void }
+ define amdgpu_gs void @hazard_getpc2() { ret void }
+ define amdgpu_gs void @hazard_getpc3() { ret void }
+ define amdgpu_gs void @hazard_getpc4() { ret void }
+ define amdgpu_gs void @hazard_vcc1() { ret void }
+ define amdgpu_gs void @hazard_vcc2() { ret void }
+ define amdgpu_gs void @hazard_vcc3() { ret void }
+ define amdgpu_gs void @hazard_addc1() { ret void }
+ define amdgpu_gs void @hazard_addc2() { ret void }
+ define amdgpu_gs void @hazard_addc3() { ret void }
+ define amdgpu_gs void @hazard_addc4() { ret void }
+ define amdgpu_gs void @hazard_addc5() { ret void }
+ define amdgpu_gs void @hazard_addc6() { ret void }
+ define amdgpu_gs void @hazard_vaddc1() { ret void }
+ define amdgpu_gs void @hazard_gap1() { ret void }
+ define amdgpu_gs void @hazard_gap2() { ret void }
+ define amdgpu_gs void @hazard_gap3() { ret void }
+ define amdgpu_gs void @hazard_gap4_no_hazard() { ret void }
+ define amdgpu_gs void @hazard_valu_write1_no_hazard() { ret void }
+ define amdgpu_gs void @hazard_post_order1() { ret void }
+ define amdgpu_gs void @hazard_post_order2() { ret void }
+ define amdgpu_gs void @hazard_post_order_cycle() { ret void }
+ define amdgpu_cs void @hazard_calls() { ret void }
+...
+
+---
+name: hazard_getpc1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_getpc1
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_getpc1
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_getpc2
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_getpc2
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_getpc2
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_getpc3
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_getpc3
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 20, implicit-def $scc, implicit $scc
+ ; GCN-O0-NEXT: }
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_getpc3
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc
+ ; GCN-O2-NEXT: }
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ BUNDLE implicit-def $sgpr0_sgpr1 {
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 12, implicit-def $scc, implicit $scc
+ }
+ S_ENDPGM 0
+...
+
+---
+name: hazard_getpc4
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_getpc4
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 28, implicit-def $scc, implicit $scc
+ ; GCN-O0-NEXT: }
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_getpc4
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1
+ ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 24, implicit-def $scc, implicit $scc
+ ; GCN-O2-NEXT: }
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ BUNDLE implicit-def $sgpr0_sgpr1 {
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr1 = S_SEXT_I32_I16 $sgpr1
+ $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc
+ }
+ S_ENDPGM 0
+...
+
+---
+name: hazard_vcc1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_vcc1
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec
+ ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_vcc1
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec
+ ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec
+ $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_vcc2
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_vcc2
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec
+ ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_vcc2
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec
+ ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_vcc3
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_vcc3
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec
+ ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_vcc3
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec
+ ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc1
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc1
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc2
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc2
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc2
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc3
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc3
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc3
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc4
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc4
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc4
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec
+ $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc5
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc5
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc5
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr16 = S_MOV_B32 0
+ $sgpr32 = S_MOV_B32 0
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc6
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc6
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr48 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr80 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr96 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc6
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr48 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr80 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr96 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr16 = S_MOV_B32 0
+ $sgpr32 = S_MOV_B32 0
+ $sgpr48 = S_MOV_B32 0
+ $sgpr80 = S_MOV_B32 0
+ $sgpr96 = S_MOV_B32 0
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_vaddc1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_vaddc1
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_vaddc1
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: hazard_gap1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_gap1
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_gap1
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_gap2
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_gap2
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_gap2
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_gap3
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_gap3
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_gap3
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_gap4_no_hazard
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_gap4_no_hazard
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_gap4_no_hazard
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc
+ $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc
+ $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_valu_write1_no_hazard
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_valu_write1_no_hazard
+ ; GCN-O0: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_valu_write1_no_hazard
+ ; GCN-O2: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_post_order1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_post_order1
+ ; GCN-O0: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_post_order1
+ ; GCN-O2: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: hazard_post_order2
+body: |
+ ; GCN-O0-LABEL: name: hazard_post_order2
+ ; GCN-O0: bb.0:
+ ; GCN-O0-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_BRANCH %bb.1
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.1:
+ ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_post_order2
+ ; GCN-O2: bb.0:
+ ; GCN-O2-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_BRANCH %bb.1
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.1:
+ ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ bb.0:
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: hazard_post_order_cycle
+body: |
+ ; GCN-O0-LABEL: name: hazard_post_order_cycle
+ ; GCN-O0: bb.0:
+ ; GCN-O0-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.1:
+ ; GCN-O0-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.2:
+ ; GCN-O0-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.3:
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_post_order_cycle
+ ; GCN-O2: bb.0:
+ ; GCN-O2-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.1:
+ ; GCN-O2-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.2:
+ ; GCN-O2-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.3:
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0
+
+ bb.1:
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+
+ bb.2:
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ S_CBRANCH_SCC0 %bb.1, implicit $scc
+
+ bb.3:
+ S_ENDPGM 0
+...
+
+---
+name: hazard_calls
+frameInfo:
+ hasCalls: true
+body: |
+ ; GCN-O0-LABEL: name: hazard_calls
+ ; GCN-O0: bb.0:
+ ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_SETPC_B64 $sgpr0_sgpr1
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.1:
+ ; GCN-O0-NEXT: $sgpr18 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_SETPC_B64_return $sgpr0_sgpr1
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.2:
+ ; GCN-O0-NEXT: successors: %bb.3(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $sgpr20 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.3:
+ ; GCN-O0-NEXT: successors: %bb.4(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.4:
+ ; GCN-O0-NEXT: $sgpr22 = S_MOV_B32 $sgpr8
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_calls
+ ; GCN-O2: bb.0:
+ ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O2-NEXT: S_SETPC_B64 $sgpr0_sgpr1
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.1:
+ ; GCN-O2-NEXT: $sgpr18 = S_MOV_B32 0
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: S_SETPC_B64_return $sgpr0_sgpr1
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.2:
+ ; GCN-O2-NEXT: successors: %bb.3(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: $sgpr20 = S_MOV_B32 0
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.3:
+ ; GCN-O2-NEXT: successors: %bb.4(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.4:
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr22 = S_MOV_B32 $sgpr8
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ bb.0:
+ $sgpr16 = S_MOV_B32 0
+ S_SETPC_B64 $sgpr0_sgpr1
+
+ bb.1:
+ $sgpr18 = S_MOV_B32 0
+ S_SETPC_B64_return $sgpr0_sgpr1
+
+ bb.2:
+ $sgpr20 = S_MOV_B32 0
+ $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
+ $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc
+
+ bb.3:
+ $sgpr8_sgpr9 = S_CALL_B64 0
+
+ bb.4:
+ $sgpr22 = S_MOV_B32 $sgpr8
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
index 3f40a57ca1491e..e3b96c08348fcb 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
@@ -1,11 +1,12 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s
# GCN-LABEL: name: hazard_vcmpx_permlane16
# GCN: V_CMPX_LE_F32_nosdst_e32
# GCN: S_ADD_U32
# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GFX12-NEXT: S_WAITCNT_DEPCTR
# GCN-NEXT: V_PERMLANE16_B32_e64
---
name: hazard_vcmpx_permlane16
@@ -128,6 +129,7 @@ body: |
# GCN: V_CMPX_LE_F32_nosdst_e32
# GCN: S_ADD_U32
# GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+# GFX12-NEXT: S_WAITCNT_DEPCTR
# GCN-NEXT: V_PERMLANE16_B32_e64
---
name: hazard_vcmpx_permlane16_undef_src
@@ -150,6 +152,7 @@ body: |
# GCN: V_CMPX_LE_F32_nosdst_e64
# GCN: S_ADD_U32
# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GFX12-NEXT: S_WAITCNT_DEPCTR
# GCN-NEXT: V_PERMLANE16_B32_e64
---
name: hazard_vcmpx_e64_permlane16
>From a6326c7de2ff7cdbee901851fac5e9a0c608c42a Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Wed, 24 Jul 2024 16:05:54 +0900
Subject: [PATCH 2/2] Address reviewer comments.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 29 ++++++++++---------
.../CodeGen/AMDGPU/valu-read-sgpr-hazard.mir | 4 +--
2 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index b2eabd2117f151..ff1d88794302cb 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2920,7 +2920,9 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
return true;
}
-static unsigned baseSGPRNumber(Register Reg, const SIRegisterInfo &TRI) {
+// Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
+// i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc
+static unsigned sgprPairNumber(Register Reg, const SIRegisterInfo &TRI) {
unsigned RegN = TRI.getEncodingValue(Reg);
assert(RegN <= 127);
return (RegN >> 1) & 0x3f;
@@ -2966,13 +2968,14 @@ void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
for (auto &MI : reverse(MBB->instrs())) {
bool IsVALU = SIInstrInfo::isVALU(MI);
bool IsSALU = SIInstrInfo::isSALU(MI);
- if (!(IsVALU || IsSALU))
+ if (!IsVALU && !IsSALU)
continue;
for (const MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
Register Reg = Op.getReg();
+ assert(!Op.getSubReg());
// Only consider implicit operands of VCC.
if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
@@ -2981,7 +2984,7 @@ void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
continue;
if (TRI.getEncodingValue(Reg) >= SGPR_NULL)
continue;
- unsigned RegN = baseSGPRNumber(Reg, TRI);
+ unsigned RegN = sgprPairNumber(Reg, TRI);
if (IsVALU && Op.isUse()) {
// Note: any access within a cycle must be considered a hazard.
if (InCycle || (ReadSGPRs[RegN] && SALUWriteSGPRs[RegN]))
@@ -3055,10 +3058,9 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
// All SGPR writes before a call/return must be flushed as the callee/caller
// will not see the hazard chain, i.e. (2) to (3) described above.
- const bool IsSetPC = (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
- MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
- MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
- MI->getOpcode() == AMDGPU::S_CALL_B64);
+ const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
+ !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
+ MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
// Collect all SGPR sources for MI which are read by a VALU.
const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
@@ -3081,7 +3083,7 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
continue;
- unsigned RegN = baseSGPRNumber(OpReg, TRI);
+ unsigned RegN = sgprPairNumber(OpReg, TRI);
if (!VALUReadHazardSGPRs[RegN])
continue;
@@ -3102,7 +3104,7 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (IsSetPC && I.getNumDefs() > 0)
return true;
// Check for any register writes.
- return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
+ return any_of(SGPRsUsed, [this, &I](Register Reg) {
return I.modifiesRegister(Reg, &TRI);
});
};
@@ -3123,9 +3125,8 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
return 0;
// SALU must be unrelated to any hazard registers.
- if (llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
- return I.readsRegister(Reg, &TRI);
- }))
+ if (any_of(SGPRsUsed,
+ [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
return 0;
return 1;
};
@@ -3147,14 +3148,14 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
return Register(AMDGPU::VCC);
// TODO: handle TTMP?
- return Register(AMDGPU::SGPR0_SGPR1 + baseSGPRNumber(Reg, TRI));
+ return Register(AMDGPU::SGPR0_SGPR1 + sgprPairNumber(Reg, TRI));
};
auto SearchHazardFn = [this, hazardPair,
&SGPRsUsed](const MachineInstr &I) {
if (!SIInstrInfo::isVALU(I))
return false;
// Check for any register reads.
- return llvm::any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
+ return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
return I.readsRegister(hazardPair(Reg), &TRI);
});
};
diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
index 2fd8353280f0f1..2aa16dd9047665 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s
-# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s
+# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s
+# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s
--- |
@mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>]
More information about the llvm-commits
mailing list